GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | vgg4_conv0_fwd | Convolution | [256,3,224,224] | 10951675.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11713.00 | 4110417920 | 66951680.00 | 1515475381.33 | 50.10 | 2.60 | 350.93 | true | 0.497684;0.496364;0.508102;0.498346;0.722906 | 4110417920;4110417920;4110417920;4110417920;4110417920 | 67067648;65167392;64460224;68620000;192726944 | 1531011264;1518560704;1496854176;1594865472;951781664 | |
0 | vgg4_conv0_fwd | Convolution | [256,3,224,224] | 10951675.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 5486.67 | 54257516544 | 15136405.33 | 1095510624.00 | 12.40 | 48.85 | 9888.98 | false | 0.123469;0.123110;0.125409;0.123741;0.144534 | 54257516544;54257516544;54257516544;54257516544;54257516544 | 14835200;15033824;15280384;15095008;22134112 | 1073003712;1090086592;1084232000;1112213280;1690745344 | |
0 | vgg4_conv0_fwd | Convolution | [256,3,224,224] | 10951675.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 5.00 | 0 | 2432.00 | 935232.00 | 23.00 | 0.00 | 0.00 | true | 0.244444;0.227180;0.230448;0.227016;0.232479 | 0;0;0;0;0 | 2432;2432;2432;2432;2464 | 933472;936544;935680;932832;941920 | |
1 | vgg4_relu0_fwd | Activation | [256,64,224,224] | 167840.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11393.67 | 1644167168 | 140202325.33 | 676786485.33 | 49.90 | 2.01 | 144.31 | true | 0.500157;0.494627;0.499906;0.498816;0.499297 | 1644167168;1644167168;1644167168;1644167168;1644167168 | 140060736;140251456;139627552;140294784;310430784 | 677898368;674017952;674238656;678222432;1456188384 | |
2 | vgg4_conv1_fwd | Convolution | [256,64,224,224] | 66633039 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 42933.00 | 445980344320 | 72862048.00 | 143479328.00 | 12.40 | 2061.47 | 10387.82 | false | 0.123286;0.124167;0.120913;0.123910;0.124160 | 445980344320;445980344320;445980344320;445980344320;445980344320 | 76687072;73191552;73231104;71514176;72163488 | 146057344;142223328;142808256;143811360;143818368 | |
2 | vgg4_conv1_fwd | Convolution | [256,64,224,224] | 66633039 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11722.33 | 4110417920 | 414488960.00 | 263354218.67 | 15.10 | 6.06 | 350.65 | true | 0.150771;0.151789;0.150038;0.151848;0.150978 | 4110417920;4110417920;4110417920;4110417920;4110417920 | 261171008;263931808;263940896;263256000;262874848 | 404739136;419657600;414360992;421222336;409448288 | |
2 | vgg4_conv1_fwd | Convolution | [256,64,224,224] | 66633039 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 237568 | 150634.67 | 291530.67 | 6.20 | 0.54 | 39.59 | true | 0.062441;0.062441;0.062441;0.062442;0.062442 | 237568;237568;237568;237568;237568 | 157120;150464;150720;150464;150720 | 302112;289568;289568;289568;295456 | |
3 | vgg4_relu1_fwd | Activation | [256,64,224,224] | 174197.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11289.67 | 1644167168 | 140942762.67 | 678188768.00 | 49.70 | 2.01 | 145.63 | true | 0.497424;0.497003;0.488071;0.500172;0.497944 | 1644167168;1644167168;1644167168;1644167168;1644167168 | 681537824;680302112;659901632;672726368;711405920 | 140744032;142841056;138157216;139243200;148818976 | |
4 | vgg4_pool0_fwd | Pooling | [256,64,224,224] | 1740752 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5045.00 | 205520896 | 235481845.33 | 295895626.67 | 37.30 | 0.39 | 40.74 | true | 0.372344;0.373639;0.376062;0.374339;0.370667 | 205520896;205520896;205520896;205520896;205520896 | 231561408;238304704;235503136;235823168;235119232 | 296891680;295466720;293963744;295328480;297911584 | |
5 | vgg4_conv2_fwd | Convolution | [256,64,112,112] | 28385105.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20708.33 | 222990172160 | 40339616.00 | 149531914.67 | 12.40 | 1174.43 | 10768.14 | false | 0.123438;0.124004;0.124498;0.123582;0.123435 | 222990172160;222990172160;222990172160;222990172160;222990172160 | 40533760;38730272;39377152;122527232;41107936 | 146896064;144677568;149190592;262144704;152509088 | |
5 | vgg4_conv2_fwd | Convolution | [256,64,112,112] | 28385105.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5309.67 | 2055208960 | 256066602.67 | 302371893.33 | 15.70 | 3.68 | 387.07 | true | 0.172039;0.148065;0.150144;0.148425;0.171999 | 2055208960;2055208960;2055208960;2055208960;2055208960 | 271097920;270805568;391545216;266471968;365212192 | 203473472;212503712;451790688;247670688;308025408 | |
5 | vgg4_conv2_fwd | Convolution | [256,64,112,112] | 28385105.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 475136 | 295104.00 | 103722.67 | 6.20 | 1.19 | 95.03 | true | 0.062341;0.062328;0.062332;0.062343;0.062331 | 475136;475136;475136;475136;475136 | 111104;108928;101888;100352;99328 | 295104;295104;295104;295104;295104 | |
6 | vgg4_relu2_fwd | Activation | [256,128,112,112] | 87146 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5043.00 | 822083584 | 139638026.67 | 677594282.67 | 55.00 | 1.01 | 163.01 | true | 0.562698;0.543091;0.519265;0.545455;0.956894 | 822083584;822083584;822083584;822083584;822083584 | 138069664;146245664;137767008;140448320;140396096 | 673114528;683091168;665959840;681859392;677808928 | |
7 | vgg4_conv3_fwd | Convolution | [256,128,112,112] | 57572437.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 38541.67 | 441664405504 | 46236405.33 | 79791008.00 | 12.30 | 3504.51 | 11459.40 | false | 0.119627;0.123478;0.124130;0.123809;0.120651 | 441664405504;441664405504;441664405504;441664405504;441664405504 | 46130464;46928032;46764640;44717824;45814112 | 79430464;79712512;79811712;80515008;79848800 | |
7 | vgg4_conv3_fwd | Convolution | [256,128,112,112] | 57572437.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5295.33 | 2055208960 | 216612053.33 | 151068394.67 | 15.80 | 5.59 | 388.12 | true | 0.151395;0.172654;0.151216;0.172892;0.151064 | 2055208960;2055208960;2055208960;2055208960;2055208960 | 217408128;209314816;217977952;332877920;214450080 | 148434720;149649440;151353536;211139168;152202208 | |
7 | vgg4_conv3_fwd | Convolution | [256,128,112,112] | 57572437.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 950272 | 590272.00 | 1044128.00 | 9.80 | 0.58 | 158.38 | true | 0.098430;0.098253;0.098439;0.098064;0.098571 | 950272;950272;950272;950272;950272 | 590528;590272;590272;590272;590272 | 1037088;1033248;1047712;1047840;1047584 | |
8 | vgg4_relu3_fwd | Activation | [256,128,112,112] | 84439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5087.33 | 822083584 | 142164458.67 | 679025536.00 | 57.10 | 1.00 | 161.59 | true | 0.551386;0.547135;0.614867;0.547756;0.997844 | 822083584;822083584;822083584;822083584;822083584 | 140554240;138510240;148988352;138772832;147166304 | 675386208;665120480;692680800;669009600;694100576 | |
9 | vgg4_pool1_fwd | Pooling | [256,128,112,112] | 810296.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 2574.67 | 102760448 | 233917888.00 | 280934090.67 | 72.70 | 0.20 | 39.91 | true | 0.727026;0.727056;0.727014;0.727027;0.727014 | 102760448;102760448;102760448;102760448;102760448 | 236822240;248800128;231209152;228927360;233722272 | 283106208;256089280;282647264;279145344;281009664 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2354.33 | 1027604480 | 135150293.33 | 554967701.33 | 99.90 | 1.49 | 436.47 | true | 0.998986;0.998993;0.998967;0.998998;0.998876 | 1027604480;1027604480;1027604480;1027604480;1027604480 | 558331488;552263584;45289504;556676768;555962752 | 137337664;132663744;22200608;136476416;136310720 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 2310.73 | 28120186880 | 44595384.00 | 186879938.67 | 17.00 | 121.48 | 12169.41 | false | 0.171041;0.170565;0.165917;0.169130;0.172644;0.169476;0.171850;0.169841;0.171470;0.170654;0.169760;0.172263;0.170433;0.172469;0.168228;0.168099;0.167571;0.169037;0.170665;0.168419 | 28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880 | 187781088;178175360;188238976;186170432;186884128;187038240;186713184;185543328;188297536;186510368;89462304;187059232;188584960;185451680;187903584;187536960;185323392;187967040;7697024;188292416 | 45436320;44700160;44598528;44848192;44477536;44758592;44349856;44186304;45860512;44304448;45301568;44330816;44254400;44065568;45392000;43906272;44863680;44356832;2996096;45726752 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 2310.73 | 28120186880 | 44595384.00 | 186879938.67 | 17.00 | 121.48 | 12169.41 | false | 0.171041;0.170565;0.165917;0.169130;0.172644;0.169476;0.171850;0.169841;0.171470;0.170654;0.169760;0.172263;0.170433;0.172469;0.168228;0.168099;0.167571;0.169037;0.170665;0.168419 | 28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880 | 45436320;44700160;44598528;44848192;44477536;44758592;44349856;44186304;45860512;44304448;45301568;44330816;44254400;44065568;45392000;43906272;44863680;44356832;2996096;45726752 | 187781088;178175360;188238976;186170432;186884128;187038240;186713184;185543328;188297536;186510368;89462304;187059232;188584960;185451680;187903584;187536960;185323392;187967040;7697024;188292416 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 2310.00 | 28120186880 | 44595384.00 | 186879938.67 | 17.00 | 121.48 | 12173.24 | false | 0.171041;0.170565;0.165917;0.169130;0.172644;0.169476;0.171850;0.169841;0.171470;0.170654;0.169760;0.172263;0.170433;0.172469;0.168228;0.168099;0.167571;0.169037;0.170665;0.168419 | 28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880 | 187781088;178175360;188238976;186170432;186884128;187038240;186713184;185543328;188297536;186510368;89462304;187059232;188584960;185451680;187903584;187536960;185323392;187967040;7697024;188292416 | 45436320;44700160;44598528;44848192;44477536;44758592;44349856;44186304;45860512;44304448;45301568;44330816;44254400;44065568;45392000;43906272;44863680;44356832;2996096;45726752 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 2310.00 | 28120186880 | 44595384.00 | 186879938.67 | 17.00 | 121.48 | 12173.24 | false | 0.171041;0.170565;0.165917;0.169130;0.172644;0.169476;0.171850;0.169841;0.171470;0.170654;0.169760;0.172263;0.170433;0.172469;0.168228;0.168099;0.167571;0.169037;0.170665;0.168419 | 28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880;28120186880 | 187781088;178175360;188238976;186170432;186884128;187038240;186713184;185543328;188297536;186510368;89462304;187059232;188584960;185451680;187903584;187536960;185323392;187967040;7697024;188292416 | 45436320;44700160;44598528;44848192;44477536;44758592;44349856;44186304;45860512;44304448;45301568;44330816;44254400;44065568;45392000;43906272;44863680;44356832;2996096;45726752 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 1547.00 | 1409111381.333 | 84686016.00 | 123189762.67 | 47.70 | 6.78 | 910.87 | true | 0.476120;0.476105;0.476673;0.478117;0.476650;0.476054;0.478664;0.479418;0.477951;0.476315;0.478868;0.476286;0.476468;0.475963;0.476578;0.476520;0.476457;0.475101;0.477859;0.479699 | 1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488 | 81285664;80692288;86685120;86980768;79827936;78931072;87666240;86674592;84192000;82394912;88564288;88229024;79270080;79194528;86661024;90477248;82691456;82079104;98471296;91054112 | 139497952;144166112;113198496;111641632;144641728;140733312;109076896;110241824;117703104;140069760;111330528;111324608;139609152;141856896;108066912;112150816;115849664;139330176;123351808;114544064 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 1545.36 | 1409111381.333 | 84686016.00 | 123189762.67 | 47.70 | 6.78 | 911.83 | true | 0.476120;0.476105;0.476673;0.478117;0.476650;0.476054;0.478664;0.479418;0.477951;0.476315;0.478868;0.476286;0.476468;0.475963;0.476578;0.476520;0.476457;0.475101;0.477859;0.479699 | 1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488 | 81285664;80692288;86685120;86980768;79827936;78931072;87666240;86674592;84192000;82394912;88564288;88229024;79270080;79194528;86661024;90477248;82691456;82079104;98471296;91054112 | 139497952;144166112;113198496;111641632;144641728;140733312;109076896;110241824;117703104;140069760;111330528;111324608;139609152;141856896;108066912;112150816;115849664;139330176;123351808;114544064 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 1530.91 | 1409111381.333 | 84686016.00 | 123189762.67 | 47.70 | 6.78 | 920.44 | true | 0.476120;0.476105;0.476673;0.478117;0.476650;0.476054;0.478664;0.479418;0.477951;0.476315;0.478868;0.476286;0.476468;0.475963;0.476578;0.476520;0.476457;0.475101;0.477859;0.479699 | 1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488 | 81285664;80692288;86685120;86980768;79827936;78931072;87666240;86674592;84192000;82394912;88564288;88229024;79270080;79194528;86661024;90477248;82691456;82079104;98471296;91054112 | 139497952;144166112;113198496;111641632;144641728;140733312;109076896;110241824;117703104;140069760;111330528;111324608;139609152;141856896;108066912;112150816;115849664;139330176;123351808;114544064 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 1530.09 | 1409111381.333 | 84686016.00 | 123189762.67 | 47.70 | 6.78 | 920.93 | true | 0.476120;0.476105;0.476673;0.478117;0.476650;0.476054;0.478664;0.479418;0.477951;0.476315;0.478868;0.476286;0.476468;0.475963;0.476578;0.476520;0.476457;0.475101;0.477859;0.479699 | 1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488;1461977088;1363935232;1454112768;1357119488 | 81285664;80692288;86685120;86980768;79827936;78931072;87666240;86674592;84192000;82394912;88564288;88229024;79270080;79194528;86661024;90477248;82691456;82079104;98471296;91054112 | 139497952;144166112;113198496;111641632;144641728;140733312;109076896;110241824;117703104;140069760;111330528;111324608;139609152;141856896;108066912;112150816;115849664;139330176;123351808;114544064 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 407.27 | 739770368 | 144102656.00 | 136270893.33 | 48.30 | 2.64 | 1816.40 | true | 0.483098;0.484560;0.483287;0.482793;0.484129;0.483850;0.482547;0.482338;0.484374;0.483434;0.483038;0.482470;0.484401;0.483916;0.482875;0.483323;0.485075;0.483546;0.482229;0.483575 | 739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368 | 138774880;145898880;142853568;140757248;146357984;148460096;142855776;140757184;143827296;151021920;142857536;140757088;148884896;153592640;142861152;140756192;153937504;156155488;142858112;140759328 | 124879520;125420512;140862240;140863552;129234240;127676544;140789184;140862240;127049120;127606880;140824032;140831648;131446496;132005504;140718080;140719104;133849824;136404480;140788416;140794816 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 406.64 | 739770368 | 144102656.00 | 136270893.33 | 48.30 | 2.64 | 1819.24 | true | 0.483098;0.484560;0.483287;0.482793;0.484129;0.483850;0.482547;0.482338;0.484374;0.483434;0.483038;0.482470;0.484401;0.483916;0.482875;0.483323;0.485075;0.483546;0.482229;0.483575 | 739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368 | 138774880;145898880;142853568;140757248;146357984;148460096;142855776;140757184;143827296;151021920;142857536;140757088;148884896;153592640;142861152;140756192;153937504;156155488;142858112;140759328 | 124879520;125420512;140862240;140863552;129234240;127676544;140789184;140862240;127049120;127606880;140824032;140831648;131446496;132005504;140718080;140719104;133849824;136404480;140788416;140794816 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 404.91 | 739770368 | 144102656.00 | 136270893.33 | 48.30 | 2.64 | 1827.00 | true | 0.483098;0.484560;0.483287;0.482793;0.484129;0.483850;0.482547;0.482338;0.484374;0.483434;0.483038;0.482470;0.484401;0.483916;0.482875;0.483323;0.485075;0.483546;0.482229;0.483575 | 739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368 | 138774880;145898880;142853568;140757248;146357984;148460096;142855776;140757184;143827296;151021920;142857536;140757088;148884896;153592640;142861152;140756192;153937504;156155488;142858112;140759328 | 124879520;125420512;140862240;140863552;129234240;127676544;140789184;140862240;127049120;127606880;140824032;140831648;131446496;132005504;140718080;140719104;133849824;136404480;140788416;140794816 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 404.73 | 739770368 | 144102656.00 | 136270893.33 | 48.30 | 2.64 | 1827.83 | true | 0.483098;0.484560;0.483287;0.482793;0.484129;0.483850;0.482547;0.482338;0.484374;0.483434;0.483038;0.482470;0.484401;0.483916;0.482875;0.483323;0.485075;0.483546;0.482229;0.483575 | 739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368;739770368 | 138774880;145898880;142853568;140757248;146357984;148460096;142855776;140757184;143827296;151021920;142857536;140757088;148884896;153592640;142861152;140756192;153937504;156155488;142858112;140759328 | 124879520;125420512;140862240;140863552;129234240;127676544;140789184;140862240;127049120;127606880;140824032;140831648;131446496;132005504;140718080;140719104;133849824;136404480;140788416;140794816 | |
10 | vgg4_conv4_fwd | Convolution | [256,128,56,56] | 26563332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 210.67 | 739770368 | 1220085.33 | 140912576.00 | 47.20 | 5.20 | 3511.56 | true | 0.472601;0.472184;0.472655;0.472583;0.472186 | 739770368;739770368;739770368;739770368;739770368 | 1218464;1221664;1219488;1221344;1219424 | 140889280;140940800;140934080;140911776;140891872 | |
11 | vgg4_relu4_fwd | Activation | [256,256,56,56] | 42553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2261.00 | 411041792 | 139006869.33 | 556850528.00 | 99.90 | 0.59 | 181.80 | true | 0.999002;0.998985;0.998973;0.998988;0.998986 | 411041792;411041792;411041792;411041792;411041792 | 138320576;9048448;139421408;142237280;139278624 | 559747808;18927200;552675072;561291392;558128704 | |
12 | vgg4_conv5_fwd | Convolution | [256,256,56,56] | 53268793.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 41618.67 | 502293069824 | 41908970.67 | 43224384.00 | 12.40 | 5900.07 | 12068.94 | false | 0.123192;0.123823;0.124231;0.124347;0.125533 | 502293069824;502293069824;502293069824;502293069824;502293069824 | 49311968;41244512;39409792;40536672;43945728 | 43239200;43204384;43200832;43263360;43229568 | |
12 | vgg4_conv5_fwd | Convolution | [256,256,56,56] | 53268793.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2345.67 | 1027604480 | 271581258.67 | 78471680.00 | 15.10 | 2.94 | 438.09 | true | 0.151145;0.151043;0.150023;0.151558;0.152306 | 1027604480;1027604480;1027604480;1027604480;1027604480 | 287437344;299742816;212735680;227563616;310053408 | 77100832;77062208;78215712;115518848;80098496 | |
12 | vgg4_conv5_fwd | Convolution | [256,256,56,56] | 53268793.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 3801088 | 2361941.33 | 3415893.33 | 32.30 | 0.66 | 335.40 | true | 0.320115;0.324185;0.324516;0.327140;0.318484 | 3801088;3801088;3801088;3801088;3801088 | 2361600;2361792;2362112;2361920;2362112 | 3416704;3414272;3438336;3401696;3416704 | |
13 | vgg4_relu5_fwd | Activation | [256,256,56,56] | 44114.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2252.00 | 411041792 | 141063157.33 | 563724000.00 | 99.90 | 0.58 | 182.52 | true | 0.998930;0.998988;0.999042;0.999012;0.998998 | 411041792;411041792;411041792;411041792;411041792 | 140565824;144715904;137767488;142053504;140570144 | 562675584;567564896;560931520;559361632;569905216 | |
14 | vgg4_conv6_fwd | Convolution | [256,256,56,56] | 53168727 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 41631.00 | 502293069824 | 44191797.33 | 43916874.67 | 12.40 | 5700.84 | 12065.36 | false | 0.123400;0.124698;0.123789;0.124362;0.121171 | 502293069824;502293069824;502293069824;502293069824;502293069824 | 44625952;43106080;47967200;43026272;44018592 | 44250176;42555040;50317824;42390368;45770176 | |
14 | vgg4_conv6_fwd | Convolution | [256,256,56,56] | 53168727 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2375.33 | 1027604480 | 268705685.33 | 76552320.00 | 15.90 | 2.98 | 432.61 | true | 0.172644;0.151259;0.150719;0.151924;0.173126 | 1027604480;1027604480;1027604480;1027604480;1027604480 | 463455232;251681632;263035392;285866624;257215040 | 120848896;76088320;76812960;76071680;76755680 | |
14 | vgg4_conv6_fwd | Convolution | [256,256,56,56] | 53168727 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359530.67 | 3524053.33 | 30.60 | 0.65 | 345.55 | true | 0.305555;0.305580;0.308398;0.306016;0.305564 | 3801088;3801088;3801088;3801088;3801088 | 2359488;2359488;2359552;2359552;2359616 | 3494848;3528480;3504992;3538688;3547392 | |
15 | vgg4_relu6_fwd | Activation | [256,256,56,56] | 42469.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2271.33 | 411041792 | 140707626.67 | 562312202.67 | 99.90 | 0.58 | 180.97 | true | 0.998985;0.999000;0.999036;0.998986;0.998987 | 411041792;411041792;411041792;411041792;411041792 | 142227392;138529952;141089696;139547936;141485248 | 562831104;566922304;560377312;557268480;563728192 | |
16 | vgg4_pool2_fwd | Pooling | [256,256,56,56] | 413574.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 1284.67 | 51380224 | 218924458.67 | 110152821.33 | 63.50 | 0.16 | 39.99 | true | 0.634690;0.634746;0.634672;0.634619;0.634671 | 51380224;51380224;51380224;51380224;51380224 | 220097728;221081664;215593984;221454688;213164992 | 110620480;111034592;108803392;111510752;107515712 | |
17 | vgg4_conv7_fwd | Convolution | [256,256,28,28] | 25955458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 23744.67 | 287024611328 | 55341941.33 | 52736522.67 | 12.30 | 2655.71 | 12087.96 | false | 0.124326;0.121630;0.123659;0.120597;0.124669 | 287024611328;287024611328;287024611328;287024611328;287024611328 | 54063424;55391264;56571136;56888960;53054080 | 52798944;94707008;52717344;52645120;52693280 | |
17 | vgg4_conv7_fwd | Convolution | [256,256,28,28] | 25955458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1136.00 | 513802240 | 254874645.33 | 95646602.67 | 15.10 | 1.47 | 452.29 | true | 0.151079;0.151390;0.176052;0.151682;0.151397 | 513802240;513802240;513802240;513802240;513802240 | 325037760;243089024;278468416;243066496;231323072 | 96279200;95056736;95603872;95026016;98299488 | |
17 | vgg4_conv7_fwd | Convolution | [256,256,28,28] | 25955458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 17.33 | 7602176 | 4729066.67 | 6398261.33 | 47.90 | 0.68 | 438.60 | true | 0.476389;0.476275;0.479037;0.480793;0.489473 | 7602176;7602176;7602176;7602176;7602176 | 4729024;4728128;4730048;4724544;4734016 | 6356384;6424960;6413440;6332096;6480032 | |
18 | vgg4_relu7_fwd | Activation | [256,512,28,28] | 21502 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1096.00 | 205520896 | 66229376.00 | 133130293.33 | 99.20 | 1.03 | 187.52 | true | 0.991381;0.991530;0.991241;0.991772;0.991638 | 205520896;205520896;205520896;205520896;205520896 | 65721280;66133504;66833344;62690016;67019648 | 132127136;132940192;134323552;126079232;134721344 | |
19 | vgg4_conv8_fwd | Convolution | [256,512,28,28] | 51967918.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 73531.00 | 947245809664 | 16780672.00 | 8386912.00 | 12.30 | 37637.53 | 12882.26 | false | 0.124123;0.121220;0.124419;0.121214;0.124628 | 947245809664;947245809664;947245809664;947245809664;947245809664 | 8386752;8386944;8387040;8389536;8386624 | 16694560;16612672;16961376;16881632;16765824 | |
19 | vgg4_conv8_fwd | Convolution | [256,512,28,28] | 51967918.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1133.00 | 513802240 | 94421610.67 | 21194197.33 | 15.20 | 4.44 | 453.49 | true | 0.152067;0.151346;0.172417;0.152277;0.151630 | 513802240;513802240;513802240;513802240;513802240 | 70863200;69269568;141635200;70766432;141636416 | 15728640;15727872;24139136;23714816;24990976 | |
19 | vgg4_conv8_fwd | Convolution | [256,512,28,28] | 51967918.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 2400.00 | 9472.00 | 5.90 | 0.00 | 0.00 | true | 0.058699;0.058998;0.058806;0.059275;0.059087 | 0;0;0;0;0 | 2400;2400;2400;2400;2400 | 9472;9472;9472;9472;9472 | |
20 | vgg4_relu8_fwd | Activation | [256,512,28,28] | 23195.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1095.00 | 205520896 | 66590368.00 | 133851594.67 | 99.10 | 1.03 | 187.69 | true | 0.991287;0.991578;0.992007;0.991583;0.991282 | 205520896;205520896;205520896;205520896;205520896 | 66787360;65834464;61110176;67333920;67149280 | 134251392;132347872;122937440;135344992;134955520 | |
21 | vgg4_conv9_fwd | Convolution | [256,512,28,28] | 51983484.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 73380.00 | 947245809664 | 16853130.67 | 8369632.00 | 12.30 | 37555.20 | 12908.77 | false | 0.121410;0.124115;0.124142;0.124401;0.121013 | 947245809664;947245809664;947245809664;947245809664;947245809664 | 16729728;16541760;17072864;16991360;16838304 | 8389376;8379456;8363072;8365600;8363840 | |
21 | vgg4_conv9_fwd | Convolution | [256,512,28,28] | 51983484.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1135.00 | 513802240 | 70284213.33 | 15728917.33 | 15.10 | 5.97 | 452.69 | true | 0.151692;0.148057;0.151080;0.148983;0.151916 | 513802240;513802240;513802240;513802240;513802240 | 15728928;18745792;15728928;15727904;15728896 | 71428352;70852000;69590208;69785056;70215584 | |
21 | vgg4_conv9_fwd | Convolution | [256,512,28,28] | 51983484.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 13909.33 | 5.90 | 0.00 | 0.00 | true | 0.059082;0.059107;0.059097;0.059158;0.059080 | 0;0;0;0;0 | 96;96;96;2656;96 | 7168;7168;31488;16640;17920 | |
22 | vgg4_relu9_fwd | Activation | [256,512,28,28] | 21384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1095.00 | 205520896 | 64110112.00 | 128897546.67 | 99.10 | 1.06 | 187.69 | true | 0.991345;0.991630;0.991325;0.991614;0.991399 | 205520896;205520896;205520896;205520896;205520896 | 65897920;63771520;59254176;63770912;64787904 | 132454976;128210368;119197312;128217920;130264352 | |
23 | vgg4_pool3_fwd | Pooling | [256,512,28,28] | 202463.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 733.67 | 25690112 | 73888.00 | 147850.67 | 63.30 | 115.86 | 35.02 | false | 0.633132;0.633119;0.633337;0.633699;0.633233 | 25690112;25690112;25690112;25690112;25690112 | 325728;2050720;2304;115520;2176 | 212256;3185344;640;8768;640 | |
24 | vgg4_conv10_fwd | Convolution | [256,512,14,14] | 13363918.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 5982.67 | 77384908800 | 66055370.67 | 89173002.67 | 12.50 | 498.52 | 12934.85 | false | 0.124296;0.124720;0.124458;0.125776;0.124554 | 77384908800;77384908800;77384908800;77384908800;77384908800 | 64396448;65471584;68298080;64049408;71442304 | 92964224;87779200;87958240;87799424;91761344 | |
24 | vgg4_conv10_fwd | Convolution | [256,512,14,14] | 13363918.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 551.00 | 503316480 | 29633525.33 | 84344448.00 | 47.60 | 4.42 | 913.46 | true | 0.475798;0.475293;0.475423;0.475700;0.475456 | 503316480;503316480;503316480;503316480;503316480 | 29082272;53344224;25878240;25859456;33940064 | 84341792;154629856;75000896;74974528;93690656 | |
24 | vgg4_conv10_fwd | Convolution | [256,512,14,14] | 13363918.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 522.67 | 538968064 | 355823424.00 | 152718613.33 | 48.50 | 1.06 | 1031.19 | true | 0.485094;0.484913;0.485162;0.485034;0.484801 | 538968064;538968064;538968064;538968064;538968064 | 177141504;156890592;124123744;116505248;208024256 | 351993696;358543488;382216032;356933088;349633600 | |
24 | vgg4_conv10_fwd | Convolution | [256,512,14,14] | 13363918.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 287.00 | 128450560 | 102583530.67 | 102754698.67 | 95.50 | 0.63 | 447.56 | true | 0.955424;0.954018;0.956181;0.956453;0.954734 | 128450560;128450560;128450560;128450560;128450560 | 102572512;102586720;102579008;102584864;102593472 | 102755360;102755680;102746816;102753056;102769248 | |
24 | vgg4_conv10_fwd | Convolution | [256,512,14,14] | 13363918.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 65.00 | 21233664 | 9443072.00 | 37585856.00 | 75.90 | 0.45 | 326.67 | true | 0.754713;0.761906;0.759954;0.760456;0.756265 | 21233664;21233664;21233664;21233664;21233664 | 9443072;9443072;9443072;9443072;9443072 | 37543968;37600448;37566976;37590144;37661600 | |
25 | vgg4_relu10_fwd | Activation | [256,512,14,14] | 5487.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.00 | 51380224 | 102761056.00 | 102759701.33 | 97.50 | 0.25 | 191.00 | true | 0.975036;0.974163;0.975231;0.975138;0.975186 | 51380224;51380224;51380224;51380224;51380224 | 102758720;102759872;102760768;102759104;102760128 | 102761056;102761056;102761056;102761056;102761056 | |
26 | vgg4_conv11_fwd | Convolution | [256,512,14,14] | 13323405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 5983.67 | 77384908800 | 121502090.67 | 128420149.33 | 12.40 | 309.64 | 12932.69 | false | 0.124319;0.124295;0.124247;0.124820;0.124234 | 77384908800;77384908800;77384908800;77384908800;77384908800 | 65343840;66896512;232423168;219766304;77843456 | 87534336;87739104;168554816;162845568;134675776 | |
26 | vgg4_conv11_fwd | Convolution | [256,512,14,14] | 13323405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 555.00 | 503316480 | 31751978.67 | 94242634.67 | 47.50 | 3.99 | 906.88 | true | 0.475943;0.475553;0.475079;0.475543;0.475292 | 503316480;503316480;503316480;503316480;503316480 | 29055072;24228288;41972576;24217408;46829536 | 84812960;65981184;122515008;75399936;136672960 | |
26 | vgg4_conv11_fwd | Convolution | [256,512,14,14] | 13323405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 513.00 | 538968064 | 299620320.00 | 126731221.33 | 48.50 | 1.26 | 1050.62 | true | 0.484817;0.484843;0.484849;0.484495;0.484593 | 538968064;538968064;538968064;538968064;538968064 | 335347456;352680704;286498496;186009472;277015008 | 157939008;150648960;116476032;47012704;113068672 | |
26 | vgg4_conv11_fwd | Convolution | [256,512,14,14] | 13323405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 288.67 | 128450560 | 102574058.67 | 102753077.33 | 95.50 | 0.63 | 444.98 | true | 0.954463;0.955636;0.955118;0.954695;0.955215 | 128450560;128450560;128450560;128450560;128450560 | 102567488;102570880;102579136;102572160;102589792 | 102743264;102743968;102756064;102759200;102785376 | |
26 | vgg4_conv11_fwd | Convolution | [256,512,14,14] | 13323405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 67.00 | 21233664 | 9437440.00 | 37583018.67 | 76.90 | 0.45 | 316.92 | true | 0.769108;0.774283;0.763797;0.762175;0.773390 | 21233664;21233664;21233664;21233664;21233664 | 9437440;9437440;9437440;9437440;9437440 | 37700192;37531520;37628544;37521792;37588992 | |
27 | vgg4_relu11_fwd | Activation | [256,512,14,14] | 5513.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.67 | 51380224 | 102760800.00 | 102758634.67 | 97.50 | 0.25 | 190.53 | true | 0.975240;0.975081;0.975037;0.974819;0.975012 | 51380224;51380224;51380224;51380224;51380224 | 102762592;102760800;102760800;102760800;102760800 | 102753472;102759168;102758208;102758848;102758848 | |
28 | vgg4_conv12_fwd | Convolution | [256,512,14,14] | 13326295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 5984.67 | 77384908800 | 65635349.33 | 87804085.33 | 12.40 | 504.34 | 12930.53 | false | 0.124093;0.124496;0.124081;0.124231;0.124939 | 77384908800;77384908800;77384908800;77384908800;77384908800 | 66076192;63300896;67528960;63143232;70766976 | 87817248;87820192;87831808;87725440;87774816 | |
28 | vgg4_conv12_fwd | Convolution | [256,512,14,14] | 13326295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 551.00 | 503316480 | 30695168.00 | 86395946.67 | 47.50 | 4.30 | 913.46 | true | 0.475884;0.475137;0.475607;0.475111;0.475354 | 503316480;503316480;503316480;503316480;503316480 | 35516512;25834112;27462880;30688000;33934624 | 103677472;75394048;75396544;84836608;98954688 | |
28 | vgg4_conv12_fwd | Convolution | [256,512,14,14] | 13326295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 520.00 | 538968064 | 354650090.67 | 161774197.33 | 48.50 | 1.04 | 1036.48 | true | 0.484934;0.484584;0.485077;0.484675;0.484688 | 538968064;538968064;538968064;538968064;538968064 | 354312384;249673920;362703200;346934688;379720192 | 156268864;92644608;178993088;165358848;163694880 | |
28 | vgg4_conv12_fwd | Convolution | [256,512,14,14] | 13326295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 290.00 | 128450560 | 102584405.33 | 102734624.00 | 95.60 | 0.63 | 442.93 | true | 0.955301;0.957453;0.955933;0.955415;0.955976 | 128450560;128450560;128450560;128450560;128450560 | 102585696;102582432;102587520;102584640;102582880 | 102763296;102727104;102713472;102694176;102786400 | |
28 | vgg4_conv12_fwd | Convolution | [256,512,14,14] | 13326295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 66.00 | 21233664 | 9438464.00 | 37603317.33 | 77.10 | 0.45 | 321.72 | true | 0.775144;0.766912;0.769851;0.772850;0.771553 | 21233664;21233664;21233664;21233664;21233664 | 9437440;9437440;9444352;9437440;9440512 | 37655552;37589760;37594944;37562304;37625248 | |
29 | vgg4_relu12_fwd | Activation | [256,512,14,14] | 5507.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102759861.33 | 97.50 | 0.25 | 190.30 | true | 0.974510;0.974516;0.974593;0.975134;0.974886 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102759488;102758560;102771008;102760640;102759456 | |
30 | vgg4_pool4_fwd | Pooling | [256,512,14,14] | 62598.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 184.67 | 6422528 | 102764533.33 | 31436469.33 | 63.10 | 0.05 | 34.78 | true | 0.631195;0.631028;0.631408;0.630247;0.629920 | 6422528;6422528;6422528;6422528;6422528 | 31499680;31518304;31042272;31370368;31439360 | 102763360;102761600;102802080;102766592;102763648 | |
31 | vgg4_dense0_fwd | FullyConnected | [256,512,7,7] | 2868556.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_tn | 5078.00 | 52709818368 | 66727701.33 | 10559797.33 | 13.10 | 682.00 | 10380.04 | false | 0.130290;0.131451;0.129232;0.132495;0.132207 | 52709818368;52709818368;52709818368;52709818368;52709818368 | 59772128;64855680;63622368;77987680;71705056 | 10282432;10631616;10762304;10606752;10441024 | |
31 | vgg4_dense0_fwd | FullyConnected | [256,512,7,7] | 2868556.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 11.67 | 1048576 | 451054858.67 | 19115669.33 | 71.50 | 0.00 | 89.88 | true | 0.718264;0.714329;0.713798;0.714101;0.715895 | 1048576;1048576;1048576;1048576;1048576 | 442944096;2029632;471523296;486314752;438697184 | 19203200;2375616;19023200;20236608;19120608 | |
32 | vgg4_dense0_relu_fwd | Activation | [256,4096] | 268 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 2097152 | 544.00 | 3906752.00 | 79.00 | 0.54 | 299.59 | true | 0.790258;0.789928;0.790736;0.790615;0.787514 | 2097152;2097152;2097152;2097152;2097152 | 544;544;544;544;544 | 3913376;3900896;3910080;3909280;3898080 | |
33 | vgg4_dropout0_fwd | Dropout | [256,4096] | 175.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::identity, 1>, float*, float*>(int, float*, float*) | 10.00 | 0 | 1280.00 | 3912640.00 | 31.80 | 0.00 | 0.00 | true | 0.318435;0.316070;0.318427;0.321635;0.317223 | 0;0;0;0;0 | 1280;1280;1280;1280;1280 | 3913728;3887424;3931008;3907008;3917184 | |
34 | vgg4_dense1_fwd | FullyConnected | [256,4096] | 465319.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x128_tn | 690.67 | 8637120512 | 565.33 | 2858.67 | 24.40 | 2522523.51 | 12505.48 | false | 0.243358;0.245906;0.244024;0.240768;0.244357 | 8637120512;8637120512;8637120512;8637120512;8637120512 | 576;512;512;608;2112 | 2816;2816;2816;2944;4480 | |
34 | vgg4_dense1_fwd | FullyConnected | [256,4096] | 465319.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 11.00 | 1048576 | 1547797.33 | 3015968.00 | 67.00 | 0.23 | 95.33 | true | 0.669426;0.664809;0.666863;0.694213;0.672251 | 1048576;1048576;1048576;1048576;1048576 | 1210176;1763648;1770688;1331136;1548608 | 3089920;2975360;2993600;2938624;3078944 | |
35 | vgg4_dense1_relu_fwd | Activation | [256,4096] | 274 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.33 | 2097152 | 288.00 | 3875872.00 | 79.00 | 0.54 | 285.99 | true | 0.790319;0.791258;0.789228;0.790230;0.789050 | 2097152;2097152;2097152;2097152;2097152 | 288;288;288;288;288 | 3869088;3891456;3873440;3862880;3885088 | |
36 | vgg4_dropout1_fwd | Dropout | [256,4096] | 176.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::identity, 1>, float*, float*>(int, float*, float*) | 10.00 | 0 | 0.00 | 3959274.67 | 29.30 | 0.00 | 0.00 | true | 0.292934;0.293739;0.295077;0.289894;0.293142 | 0;0;0;0;0 | 0;0;0;0;0 | 3952448;4020352;3967520;3949248;3957856 | |
37 | vgg4_dense2_fwd | FullyConnected | [256,4096] | 115355.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_tn | 192.67 | 2159280128 | 19857258.67 | 3142208.00 | 12.10 | 93.88 | 11207.32 | false | 0.121103;0.121074;0.121228;0.121078;0.121083 | 2159280128;2159280128;2159280128;2159280128;2159280128 | 19837888;19876288;19862336;19847360;19862080 | 3143520;3143776;3139520;3133184;3143584 | |
37 | vgg4_dense2_fwd | FullyConnected | [256,4096] | 115355.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 5.00 | 256000 | 5930.67 | 0.00 | 54.60 | 43.17 | 51.20 | false | 0.545342;0.541527;0.550167;0.546753;0.544490 | 256000;256000;256000;256000;256000 | 4224;4224;9344;4224;9344 | 0;0;0;0;0 |
Showing 1 to 84 of 84 entries