GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | densenet0_conv0_fwd | Convolution | [32,3,224,224] | 520353.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_medium_nn_v1 | 670.67 | 7861174272 | 21.33 | 5205.33 | 24.30 | 1504051.39 | 11721.43 | false | 0.242764;0.242140;0.242521;0.242689;0.242465 | 7861174272;7861174272;7861174272;7861174272;7861174272 | 0;64;0;0;128 | 5120;4992;5120;27776;5376 | |
0 | densenet0_conv0_fwd | Convolution | [32,3,224,224] | 520353.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 618.67 | 240085.33 | 7.50 | 0.00 | 0.00 | true | 0.074943;0.074649;0.074675;0.074731;0.074356 | 0;0;0;0;0 | 263104;248896;236032;235328;232768 | 3424;608;640;352;608 | |
1 | densenet0_batchnorm0_fwd | BatchNorm | [32,64,112,112] | 8265.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 272.00 | 155189248 | 102769376.00 | 102933546.67 | 92.70 | 0.75 | 570.55 | true | 0.927130;0.926653;0.927629;0.926525;0.927090 | 155189248;155189248;155189248;155189248;155189248 | 102768928;102774432;102769952;102769248;102768544 | 102932256;102940320;102933184;102912576;102935200 | |
2 | densenet0_relu0_fwd | Activation | [32,64,112,112] | 8444.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 271.33 | 51380224 | 102761482.67 | 102576704.00 | 97.50 | 0.25 | 189.36 | true | 0.974617;0.975412;0.975688;0.975551;0.974112 | 51380224;51380224;51380224;51380224;51380224 | 102761312;102761568;102761568;102761312;102766176 | 102574272;102575552;102578112;102576448;102590016 | |
3 | densenet0_pool0_fwd | Pooling | [32,64,112,112] | 81128 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 216.33 | 6422528 | 102768384.00 | 32166229.33 | 72.00 | 0.05 | 29.69 | true | 0.720137;0.719914;0.719584;0.721086;0.719950 | 6422528;6422528;6422528;6422528;6422528 | 102768672;102768384;102768448;102768320;102768320 | 32057696;32240160;32259968;32067680;32190848 | |
4 | densenet0_stage1_batchnorm0_fwd | BatchNorm | [32,64,56,56] | 4206.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 68.00 | 39583744 | 25692810.67 | 24100234.67 | 85.70 | 0.79 | 582.11 | true | 0.858060;0.857095;0.856201;0.856061;0.857611 | 39583744;39583744;39583744;39583744;39583744 | 25692768;25692768;25692896;25692768;25693024 | 24189952;24012768;24065056;24191808;24045696 | |
5 | densenet0_stage1_relu0_fwd | Activation | [32,64,56,56] | 2132.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.00 | 12845056 | 25690464.00 | 25618965.33 | 94.40 | 0.25 | 186.16 | true | 0.942310;0.943981;0.944605;0.944419;0.945803 | 12845056;12845056;12845056;12845056;12845056 | 25617216;25614784;25620928;25622208;25618752 | 25690464;25690464;25690464;25690464;25690464 | |
6 | densenet0_stage1_conv0_fwd | Convolution | [32,64,56,56] | 111351.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 166.33 | 1669857280 | 51472064.00 | 51316714.67 | 23.70 | 16.25 | 10039.24 | true | 0.237222;0.236811;0.237379;0.238009;0.237325 | 1669857280;1669857280;1669857280;1669857280;1669857280 | 51321952;51284352;51313088;51321536;51315520 | 51474752;51469376;51468096;51472064;51477824 | |
6 | densenet0_stage1_conv0_fwd | Convolution | [32,64,56,56] | 111351.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 181.33 | 26240.00 | 6.10 | 0.00 | 0.00 | true | 0.061198;0.061167;0.061202;0.061261;0.061190 | 0;0;0;0;0 | 352;96;96;352;96 | 26496;32896;25984;26240;25984 | |
7 | densenet0_stage1_batchnorm1_fwd | BatchNorm | [32,128,56,56] | 4771 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 79167488 | 51390197.33 | 51516618.67 | 86.90 | 0.77 | 586.43 | true | 0.869684;0.869169;0.869174;0.870436;0.868934 | 79167488;79167488;79167488;79167488;79167488 | 51508128;51526176;51527840;51506848;51515552 | 51390496;51390176;51389920;51389024;51391904 | |
8 | densenet0_stage1_relu1_fwd | Activation | [32,128,56,56] | 4246 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380746.67 | 51306592.00 | 95.80 | 0.25 | 188.44 | true | 0.958304;0.958273;0.956609;0.957977;0.958138 | 25690112;25690112;25690112;25690112;25690112 | 51305504;51324992;51295808;51308736;51305536 | 51380576;51380832;51381088;51380576;51380832 | |
9 | densenet0_stage1_conv1_fwd | Convolution | [32,128,56,56] | 604882.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 378.33 | 3943432192 | 144629258.67 | 15308554.67 | 24.30 | 24.66 | 10423.18 | false | 0.242643;0.241186;0.243251;0.242622;0.243526 | 3943432192;3943432192;3943432192;3943432192;3943432192 | 145791712;142251168;145844896;141843936;146207776 | 15294624;15228800;15313408;15325664;15317632 | |
9 | densenet0_stage1_conv1_fwd | Convolution | [32,128,56,56] | 604882.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.33 | 237568 | 149269.33 | 263669.33 | 6.20 | 0.58 | 44.55 | true | 0.062412;0.062422;0.062422;0.062421;0.062421 | 237568;237568;237568;237568;237568 | 149184;149184;149440;149184;149440 | 263328;250272;264352;263840;263840 | |
10 | densenet0_stage1_concat0 | Concat | [32,64,56,56] | 3700.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 56.43 | 0 | 19268576.00 | 18084816.00 | 85.40 | 0.00 | 0.00 | true | 0.859220;0.848290;0.860465;0.850031;0.859251;0.847834;0.859242;0.848175;0.860761;0.848229 | 0;0;0;0;0;0;0;0;0;0 | 25691872;12845280;25691872;12845280;25691872;12845280;25691872;12845280;25691872;12845280 | 23310304;12884000;23383712;12865952;23289632;12870688;23270816;12883456;23328928;12827808 | |
10 | densenet0_stage1_concat0 | Concat | [32,64,56,56] | 3700.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 51.57 | 0 | 19268576.00 | 18084816.00 | 85.40 | 0.00 | 0.00 | true | 0.859220;0.848290;0.860465;0.850031;0.859251;0.847834;0.859242;0.848175;0.860761;0.848229 | 0;0;0;0;0;0;0;0;0;0 | 25691872;12845280;25691872;12845280;25691872;12845280;25691872;12845280;25691872;12845280 | 23310304;12884000;23383712;12865952;23289632;12870688;23270816;12883456;23328928;12827808 | |
11 | densenet0_stage1_batchnorm2_fwd | BatchNorm | [32,96,56,56] | 4962.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 102.00 | 59375616 | 38540597.33 | 38506826.67 | 86.70 | 0.77 | 582.11 | true | 0.867270;0.865333;0.867293;0.866312;0.866266 | 59375616;59375616;59375616;59375616;59375616 | 38540896;38540384;38540640;38540256;38540768 | 38499872;38498720;38511392;38509216;38516384 | |
12 | densenet0_stage1_relu2_fwd | Activation | [32,96,56,56] | 3191.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 104.00 | 19267584 | 38535520.00 | 38459797.33 | 95.50 | 0.25 | 185.27 | true | 0.955305;0.955606;0.955437;0.953246;0.954336 | 19267584;19267584;19267584;19267584;19267584 | 38458816;38470592;38452288;38462528;38458048 | 38535520;38535520;38535776;38535520;38535520 | |
13 | densenet0_stage1_conv2_fwd | Convolution | [32,96,56,56] | 139199.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 221.33 | 2491940864 | 77177536.00 | 51356853.33 | 23.70 | 19.39 | 11258.79 | false | 0.237034;0.237175;0.236788;0.237174;0.236390 | 2491940864;2491940864;2491940864;2491940864;2491940864 | 77167424;77190336;77177152;77188032;77166400 | 51343360;51365184;51359936;51346944;51363680 | |
13 | densenet0_stage1_conv2_fwd | Convolution | [32,96,56,56] | 139199.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 26026.67 | 6.10 | 0.00 | 0.00 | true | 0.061381;0.061188;0.061175;0.061168;0.061183 | 0;0;0;0;0 | 96;96;96;96;96 | 26112;26112;25984;25984;25984 | |
14 | densenet0_stage1_batchnorm3_fwd | BatchNorm | [32,128,56,56] | 5462.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 79167488 | 51390880.00 | 51472981.33 | 86.90 | 0.77 | 586.43 | true | 0.868091;0.869008;0.868538;0.869101;0.868961 | 79167488;79167488;79167488;79167488;79167488 | 51390112;51391008;51396192;51391136;51390496 | 51482016;51469984;51482944;51464864;51466944 | |
15 | densenet0_stage1_relu3_fwd | Activation | [32,128,56,56] | 5028.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.67 | 25690112 | 51380576.00 | 51305024.00 | 95.70 | 0.25 | 186.61 | true | 0.957498;0.955412;0.958776;0.956200;0.958079 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51309248;51300160;51292864;51310144;51305664 | |
16 | densenet0_stage1_conv3_fwd | Convolution | [32,128,56,56] | 603816 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 378.67 | 3943432192 | 144816608.00 | 15203114.67 | 24.30 | 24.64 | 10413.98 | false | 0.242770;0.243056;0.242161;0.242484;0.244344 | 3943432192;3943432192;3943432192;3943432192;3943432192 | 145237792;145423904;145313184;142909536;143898848 | 15218944;15227264;15184608;15205792;15184448 | |
16 | densenet0_stage1_conv3_fwd | Convolution | [32,128,56,56] | 603816 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 347893.33 | 6.20 | 0.48 | 47.51 | true | 0.062358;0.062363;0.062360;0.062363;0.062362 | 237568;237568;237568;237568;237568 | 312864;347680;347808;348192;358048 | 148160;147648;147648;147648;147648 | |
17 | densenet0_stage1_concat1 | Concat | [32,96,56,56] | 5722.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 75.57 | 0 | 25690677.33 | 24501312.00 | 85.90 | 0.00 | 0.00 | true | 0.865845;0.851503;0.865152;0.853321;0.866196;0.852660;0.866198;0.850338;0.866291;0.848702 | 0;0;0;0;0;0;0;0;0;0 | 38546144;12845280;38535392;12845280;38535392;12845280;38535392;12845280;38535392;12847328 | 36108288;12887808;36093920;12886048;36139808;12899360;36119168;12887712;36128800;12899328 | |
17 | densenet0_stage1_concat1 | Concat | [32,96,56,56] | 5722.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 66.00 | 0 | 25690677.33 | 24501312.00 | 85.90 | 0.00 | 0.00 | true | 0.865845;0.851503;0.865152;0.853321;0.866196;0.852660;0.866198;0.850338;0.866291;0.848702 | 0;0;0;0;0;0;0;0;0;0 | 38546144;12845280;38535392;12845280;38535392;12845280;38535392;12845280;38535392;12847328 | 36108288;12887808;36093920;12886048;36139808;12899360;36119168;12887712;36128800;12899328 | |
18 | densenet0_stage1_batchnorm4_fwd | BatchNorm | [32,128,56,56] | 5970 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.67 | 79167488 | 51389386.67 | 51373813.33 | 86.90 | 0.77 | 583.54 | true | 0.870235;0.868986;0.868184;0.869451;0.869209 | 79167488;79167488;79167488;79167488;79167488 | 51390560;51389152;51388512;51388192;51390496 | 51377184;51382688;51370144;51365920;51374112 | |
19 | densenet0_stage1_relu4_fwd | Activation | [32,128,56,56] | 5010.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51381002.67 | 51303402.67 | 95.70 | 0.25 | 187.98 | true | 0.958193;0.956610;0.957012;0.957875;0.956911 | 25690112;25690112;25690112;25690112;25690112 | 51381600;51382624;51380576;51380832;51380576 | 51287488;51309376;51303488;51308480;51298240 | |
20 | densenet0_stage1_conv4_fwd | Convolution | [32,128,56,56] | 213962.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 282.33 | 3314024448 | 102905344.00 | 51355424.00 | 23.80 | 21.48 | 11738.00 | false | 0.238621;0.237539;0.237080;0.238543;0.237778 | 3314024448;3314024448;3314024448;3314024448;3314024448 | 102901312;102918656;102935552;102896064;102892416 | 51350624;51359552;51387072;51336512;51356096 | |
20 | densenet0_stage1_conv4_fwd | Convolution | [32,128,56,56] | 213962.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 26026.67 | 6.10 | 0.00 | 0.00 | true | 0.061211;0.061179;0.061170;0.061168;0.061183 | 0;0;0;0;0 | 96;96;96;14176;96 | 26112;25984;25984;55424;25984 | |
21 | densenet0_stage1_batchnorm5_fwd | BatchNorm | [32,128,56,56] | 5487.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 79167488 | 51389642.67 | 51464448.00 | 87.00 | 0.77 | 586.43 | true | 0.869682;0.869475;0.870081;0.870438;0.869006 | 79167488;79167488;79167488;79167488;79167488 | 51464480;51461184;51432608;51467680;51473056 | 51389920;51392352;51389280;51389344;51389664 | |
22 | densenet0_stage1_relu5_fwd | Activation | [32,128,56,56] | 4666 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51311893.33 | 95.80 | 0.25 | 188.44 | true | 0.958119;0.956878;0.957014;0.957886;0.958063 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51317312;51308992;51313728;51300672;51312960 | |
23 | densenet0_stage1_conv5_fwd | Convolution | [32,128,56,56] | 602688.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 378.00 | 3943432192 | 144405301.33 | 15241802.67 | 24.30 | 24.70 | 10432.36 | false | 0.241314;0.240783;0.245094;0.242935;0.243366 | 3943432192;3943432192;3943432192;3943432192;3943432192 | 145971616;141547680;144484640;144041632;144689632 | 15278752;15225696;15256288;15227264;15241856 | |
23 | densenet0_stage1_conv5_fwd | Convolution | [32,128,56,56] | 602688.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 293109.33 | 6.20 | 0.54 | 50.90 | true | 0.062371;0.062363;0.062362;0.062370;0.062364 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 292640;293664;293408;293280;278816 | |
24 | densenet0_stage1_concat2 | Concat | [32,128,56,56] | 5940.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 94.29 | 0 | 32112864.00 | 30887637.33 | 86.00 | 0.00 | 0.00 | true | 0.868044;0.849078;0.867725;0.848542;0.867208;0.852549;0.866748;0.851278;0.868007;0.851493 | 0;0;0;0;0;0;0;0;0;0 | 51380448;12845280;51380448;12845280;51380448;12845280;51380448;12845280;51380448;12845280 | 48924992;12815744;48993568;12820480;48946528;12824352;48974784;12830976;48962368;12836608 | |
24 | densenet0_stage1_concat2 | Concat | [32,128,56,56] | 5940.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 80.00 | 0 | 32112864.00 | 30887637.33 | 86.00 | 0.00 | 0.00 | true | 0.868044;0.849078;0.867725;0.848542;0.867208;0.852549;0.866748;0.851278;0.868007;0.851493 | 0;0;0;0;0;0;0;0;0;0 | 51380448;12845280;51380448;12845280;51380448;12845280;51380448;12845280;51380448;12845280 | 48924992;12815744;48993568;12820480;48946528;12824352;48974784;12830976;48962368;12836608 | |
25 | densenet0_stage1_batchnorm6_fwd | BatchNorm | [32,160,56,56] | 6557 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 166.33 | 98959360 | 64244064.00 | 64291274.67 | 87.20 | 0.77 | 594.95 | true | 0.872155;0.870165;0.871247;0.872190;0.871638 | 98959360;98959360;98959360;98959360;98959360 | 64244320;64244768;64243104;64242784;64244832 | 64327360;64283424;64298144;64292256;64278976 | |
26 | densenet0_stage1_relu6_fwd | Activation | [32,160,56,56] | 4404.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 170.33 | 32112640 | 64225632.00 | 64163605.33 | 96.20 | 0.25 | 188.53 | true | 0.962694;0.960340;0.960970;0.961017;0.962678 | 32112640;32112640;32112640;32112640;32112640 | 64144576;64180288;64155328;64166080;64169408 | 64225632;64231008;64225632;64225632;64225632 | |
27 | densenet0_stage1_conv6_fwd | Convolution | [32,160,56,56] | 247651.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 344.00 | 4136108032 | 128695296.00 | 51409642.67 | 23.80 | 22.96 | 12023.57 | false | 0.238686;0.238569;0.237395;0.237860;0.237959 | 4136108032;4136108032;4136108032;4136108032;4136108032 | 128675008;128709696;128637568;128727744;128701184 | 51366400;51437792;51452064;51371040;51420096 | |
27 | densenet0_stage1_conv6_fwd | Convolution | [32,160,56,56] | 247651.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 25984.00 | 6.10 | 0.00 | 0.00 | true | 0.061215;0.061188;0.061180;0.061199;0.061181 | 0;0;0;0;0 | 25984;25984;25984;26112;25984 | 96;96;96;96;96 | |
28 | densenet0_stage1_batchnorm7_fwd | BatchNorm | [32,128,56,56] | 4072 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 79167488 | 51390858.67 | 51413877.33 | 86.90 | 0.77 | 586.43 | true | 0.867717;0.869649;0.868978;0.869407;0.869501 | 79167488;79167488;79167488;79167488;79167488 | 51391392;51390688;51390176;51390496;51391392 | 51457184;51376800;51385888;51462944;51398560 | |
29 | densenet0_stage1_relu7_fwd | Activation | [32,128,56,56] | 3578 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51305920.00 | 95.70 | 0.25 | 188.44 | true | 0.956918;0.957030;0.957327;0.956684;0.958111 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51310912;51307712;51292608;51299136;51312064 | |
30 | densenet0_stage1_conv7_fwd | Convolution | [32,128,56,56] | 604255.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 378.33 | 3943432192 | 143515360.00 | 15293056.00 | 24.20 | 24.83 | 10423.18 | false | 0.241255;0.241410;0.241561;0.243412;0.242463 | 3943432192;3943432192;3943432192;3943432192;3943432192 | 145016544;142469472;144930656;142906784;142708640 | 15308480;15273472;15297216;15160000;15314208 | |
30 | densenet0_stage1_conv7_fwd | Convolution | [32,128,56,56] | 604255.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 266058.67 | 6.20 | 0.57 | 47.51 | true | 0.062361;0.062362;0.062360;0.062372;0.062370 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 265376;266912;280992;265888;265376 | |
31 | densenet0_stage1_concat3 | Concat | [32,160,56,56] | 6196.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 113.57 | 0 | 38535392.00 | 37363941.33 | 86.00 | 0.00 | 0.00 | true | 0.868391;0.846287;0.868876;0.849916;0.868692;0.849512;0.868835;0.851712;0.868966;0.849037 | 0;0;0;0;0;0;0;0;0;0 | 64225504;12845280;64225504;12845280;64225504;12845280;64225504;12845280;64225504;12845280 | 61768448;12956288;61805312;12957088;61764480;12953248;61914304;12972832;61761184;12959616 | |
31 | densenet0_stage1_concat3 | Concat | [32,160,56,56] | 6196.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 94.43 | 0 | 38535392.00 | 37363941.33 | 86.00 | 0.00 | 0.00 | true | 0.868391;0.846287;0.868876;0.849916;0.868692;0.849512;0.868835;0.851712;0.868966;0.849037 | 0;0;0;0;0;0;0;0;0;0 | 64225504;12845280;64225504;12845280;64225504;12845280;64225504;12845280;64225504;12845280 | 61768448;12956288;61805312;12957088;61764480;12953248;61914304;12972832;61761184;12959616 | |
32 | densenet0_stage1_batchnorm8_fwd | BatchNorm | [32,192,56,56] | 7639 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 199.00 | 118751232 | 77105141.33 | 77020448.00 | 87.20 | 0.77 | 596.74 | true | 0.872365;0.872380;0.871287;0.872633;0.872002 | 118751232;118751232;118751232;118751232;118751232 | 77105568;77104864;77106016;77104992;77104672 | 77025952;77014688;77028640;77011456;77020704 | |
33 | densenet0_stage1_relu8_fwd | Activation | [32,192,56,56] | 5194 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 204.33 | 38535168 | 77070688.00 | 76995381.33 | 96.70 | 0.25 | 188.59 | true | 0.966593;0.966347;0.966113;0.966989;0.966795 | 38535168;38535168;38535168;38535168;38535168 | 77070688;77070688;77070688;77070688;77070688 | 76993824;77000640;76982208;76994368;76997952 | |
34 | densenet0_stage1_conv8_fwd | Convolution | [32,192,56,56] | 276213.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 408.67 | 4958191616 | 150380565.33 | 50295061.33 | 23.80 | 24.71 | 12132.60 | false | 0.238223;0.237808;0.237933;0.238906;0.238538 | 4958191616;4958191616;4958191616;4958191616;4958191616 | 51355296;49765248;49761664;49764640;51362240 | 154393920;147157984;147139424;149589792;154405440 | |
34 | densenet0_stage1_conv8_fwd | Convolution | [32,192,56,56] | 276213.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 25984.00 | 6.10 | 0.00 | 0.00 | true | 0.061178;0.061233;0.061197;0.061177;0.061211 | 0;0;0;0;0 | 96;96;96;96;96 | 25984;25984;25984;25984;25984 | |
35 | densenet0_stage1_batchnorm9_fwd | BatchNorm | [32,128,56,56] | 4131.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 79167488 | 51390837.33 | 51455402.67 | 87.00 | 0.77 | 586.43 | true | 0.869093;0.869478;0.870938;0.870363;0.870455 | 79167488;79167488;79167488;79167488;79167488 | 51390688;51390304;51390112;51402592;51391520 | 51448352;51442336;51450912;51469088;51466944 | |
36 | densenet0_stage1_relu9_fwd | Activation | [32,128,56,56] | 3496 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380661.33 | 51298570.67 | 95.70 | 0.25 | 187.98 | true | 0.956923;0.957215;0.955964;0.956569;0.957292 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380832;51380576;51382624 | 51300032;51306176;51299904;51289920;51295776 | |
37 | densenet0_stage1_conv9_fwd | Convolution | [32,128,56,56] | 604078 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 378.00 | 3943432192 | 145392373.33 | 15285781.33 | 24.30 | 24.54 | 10432.36 | false | 0.243769;0.243356;0.242451;0.241736;0.244125 | 3943432192;3943432192;3943432192;3943432192;3943432192 | 145374112;144957920;145845088;146432928;142063904 | 15241824;15267584;15289184;15300576;15307840 | |
37 | densenet0_stage1_conv9_fwd | Convolution | [32,128,56,56] | 604078 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147648.00 | 276256.00 | 6.20 | 0.56 | 54.83 | true | 0.062369;0.062363;0.062368;0.062363;0.062364 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 280096;276256;276000;276512;262560 | |
38 | densenet0_stage1_concat4 | Concat | [32,192,56,56] | 7287 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanLargeKernel<mshadow::sv::saveto, 8, 1024, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, int) | 234.67 | 0 | 77083104.00 | 74854933.33 | 82.30 | 0.00 | 0.00 | true | 0.822446;0.820723;0.823525;0.823159;0.822767 | 0;0;0;0;0 | 77083104;77083104;77083104;77103840;77083104 | 74883552;74864352;74854304;74846144;74834720 | |
38 | densenet0_stage1_concat4 | Concat | [32,192,56,56] | 7287 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 37.33 | 0 | 12845792.00 | 12627594.67 | 84.80 | 0.00 | 0.00 | true | 0.849526;0.847869;0.847359;0.847853;0.847438 | 0;0;0;0;0 | 12625408;12631584;12629120;12623744;12628256 | 12845792;12845792;12845792;12845792;12845792 | |
39 | densenet0_stage1_batchnorm10_fwd | BatchNorm | [32,224,56,56] | 8679.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 231.33 | 138543104 | 89974474.67 | 89962837.33 | 87.30 | 0.77 | 598.89 | true | 0.873414;0.872734;0.873319;0.872502;0.872878 | 138543104;138543104;138543104;138543104;138543104 | 89974112;89973792;89974432;89974880;89976608 | 89964448;89956640;89964224;89959840;89975744 | |
40 | densenet0_stage1_relu10_fwd | Activation | [32,224,56,56] | 5578 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 236.33 | 44957696 | 89916000.00 | 89842453.33 | 97.00 | 0.25 | 190.23 | true | 0.969530;0.970020;0.969659;0.969927;0.969471 | 44957696;44957696;44957696;44957696;44957696 | 89916000;89916000;89916000;89916000;89916000 | 89842624;89844032;89840704;89846464;89837248 | |
41 | densenet0_stage1_conv10_fwd | Convolution | [32,224,56,56] | 354593.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 488.00 | 5780275200 | 110722624.00 | 31594517.33 | 23.80 | 40.62 | 11844.83 | false | 0.238372;0.239197;0.238498;0.238380;0.238139 | 5780275200;5780275200;5780275200;5780275200;5780275200 | 98532000;118202112;123941184;95730752;115433760 | 28957376;33707840;35406336;27348032;32118336 | |
41 | densenet0_stage1_conv10_fwd | Convolution | [32,224,56,56] | 354593.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 181.33 | 26154.67 | 6.10 | 0.00 | 0.00 | true | 0.061219;0.061187;0.061259;0.061217;0.061175 | 0;0;0;0;0 | 25984;26496;25984;28544;25984 | 96;352;96;1376;96 | |
42 | densenet0_stage1_batchnorm11_fwd | BatchNorm | [32,128,56,56] | 3301.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 79167488 | 51390730.67 | 51376778.67 | 86.80 | 0.77 | 586.43 | true | 0.868774;0.866415;0.868516;0.867938;0.868781 | 79167488;79167488;79167488;79167488;79167488 | 51346752;51449504;51331744;51346592;51436992 | 51390240;51390880;51392032;51390880;51390432 | |
43 | densenet0_stage1_relu11_fwd | Activation | [32,128,56,56] | 2830.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51300245.33 | 95.70 | 0.25 | 188.44 | true | 0.958124;0.956971;0.957746;0.956867;0.957725 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51303744;51295936;51300928;51296064;51310016 | |
44 | densenet0_stage1_conv11_fwd | Convolution | [32,128,56,56] | 602882.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 379.33 | 3943432192 | 144470026.67 | 15189504.00 | 24.30 | 24.70 | 10395.70 | false | 0.242573;0.242238;0.243397;0.242120;0.245978 | 3943432192;3943432192;3943432192;3943432192;3943432192 | 146195424;144764896;144309344;144335840;142565600 | 15194080;15191520;15165696;15206592;15182912 | |
44 | densenet0_stage1_conv11_fwd | Convolution | [32,128,56,56] | 602882.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 381088.00 | 6.20 | 0.45 | 47.51 | true | 0.062363;0.062363;0.062374;0.062363;0.062368 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 381088;380704;381472;381472;368160 | |
45 | densenet0_stage1_concat5 | Concat | [32,224,56,56] | 6693 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanLargeKernel<mshadow::sv::saveto, 8, 1024, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, int) | 270.33 | 0 | 89915616.00 | 87734837.33 | 82.10 | 0.00 | 0.00 | true | 0.822396;0.819521;0.822146;0.821110;0.817296 | 0;0;0;0;0 | 89918944;89915616;89915616;89915616;89915616 | 87728864;87720800;87767680;87720288;87754848 | |
45 | densenet0_stage1_concat5 | Concat | [32,224,56,56] | 6693 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 37.33 | 0 | 12845280.00 | 12603616.00 | 84.90 | 0.00 | 0.00 | true | 0.849763;0.848024;0.850314;0.849109;0.849203 | 0;0;0;0;0 | 12608512;12614784;12596992;12605344;12592032 | 12845280;12845280;12845280;12845280;12845280 | |
46 | densenet0_batchnorm1_fwd | BatchNorm | [32,256,56,56] | 8710.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 264.00 | 158334976 | 102846218.67 | 102777184.00 | 87.40 | 0.77 | 599.75 | true | 0.874007;0.873312;0.874306;0.874154;0.873729 | 158334976;158334976;158334976;158334976;158334976 | 102847200;102847840;102845088;102842080;102846368 | 102774432;102759840;102785088;102772032;102787616 | |
47 | densenet0_relu1_fwd | Activation | [32,256,56,56] | 5402.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 273.00 | 51380224 | 102760800.00 | 102700650.67 | 97.50 | 0.25 | 188.21 | true | 0.975797;0.975709;0.974445;0.975269;0.975151 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102701760;102711104;102693440;102706752;102691392 | |
48 | densenet0_conv1_fwd | Convolution | [32,256,56,56] | 387723.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 541.33 | 6602358784 | 69707648.00 | 17656053.33 | 24.00 | 75.57 | 12196.48 | false | 0.240156;0.239975;0.240009;0.240992;0.240080 | 6602358784;6602358784;6602358784;6602358784;6602358784 | 80419200;67552192;70783936;70786816;48285440 | 20066432;17657152;17652608;17658400;12869920 | |
48 | densenet0_conv1_fwd | Convolution | [32,256,56,56] | 387723.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 26240.00 | 6.10 | 0.00 | 0.00 | true | 0.061189;0.061188;0.061223;0.061176;0.061189 | 0;0;0;0;0 | 96;96;6752;96;96 | 26752;25984;40064;25984;25984 | |
49 | densenet0_pool1_fwd | Pooling | [32,128,56,56] | 34163 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 96.00 | 67436544 | 51384917.33 | 14529781.33 | 51.80 | 1.02 | 702.46 | true | 0.517531;0.516921;0.517735;0.517686;0.517380 | 67436544;67436544;67436544;67436544;67436544 | 51384960;51384512;51385280;51384256;51385280 | 14526496;14540704;14530336;14532512;14404896 | |
50 | densenet0_stage2_batchnorm0_fwd | BatchNorm | [32,128,28,28] | 866.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 35.00 | 20316160 | 12847328.00 | 11389344.00 | 80.40 | 0.84 | 580.46 | true | 0.807774;0.801829;0.802927;0.805252;0.804938 | 20316160;20316160;20316160;20316160;20316160 | 12847328;12847328;12847328;12847328;12847328 | 11389984;11392672;11397216;11385376;11381696 | |
51 | densenet0_stage2_relu0_fwd | Activation | [32,128,28,28] | 718.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 35.33 | 6422528 | 12845664.00 | 12617408.00 | 94.10 | 0.25 | 181.77 | true | 0.938529;0.939838;0.942494;0.941113;0.940656 | 6422528;6422528;6422528;6422528;6422528 | 12845664;12845664;12845664;12845664;12845664 | 12619072;12617024;12609408;12621376;12616128 | |
52 | densenet0_stage2_conv0_fwd | Convolution | [32,128,28,28] | 53357.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 80.67 | 828506112 | 21558464.00 | 13080160.00 | 20.50 | 23.92 | 10270.69 | false | 0.205784;0.205233;0.207443;0.202469;0.204676 | 828506112;828506112;828506112;828506112;828506112 | 21886272;21824896;23214080;20964224;20651072 | 13090240;13077216;13082016;13061184;13081248 | |
52 | densenet0_stage2_conv0_fwd | Convolution | [32,128,28,28] | 53357.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7466.67 | 5.90 | 0.00 | 0.00 | true | 0.059363;0.058188;0.059689;0.059734;0.058587 | 0;0;0;0;0 | 2144;96;96;96;96 | 11264;7424;7552;7424;7424 | |
53 | densenet0_stage2_batchnorm1_fwd | BatchNorm | [32,128,28,28] | 676.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12668576.00 | 12842666.67 | 81.10 | 0.80 | 564.34 | true | 0.811838;0.811966;0.811021;0.810485;0.810195 | 20316160;20316160;20316160;20316160;20316160 | 12582816;12696288;12679840;12657824;12668064 | 12822848;12845184;12835744;12862720;12847072 | |
54 | densenet0_stage2_relu1_fwd | Activation | [32,128,28,28] | 708.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12847114.67 | 12614101.33 | 93.90 | 0.25 | 178.40 | true | 0.939478;0.940526;0.938671;0.935649;0.937764 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12850528;12850528;12845408;12845408 | 12600704;12622080;12632768;12615744;12604480 | |
55 | densenet0_stage2_conv1_fwd | Convolution | [32,128,28,28] | 153165 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 93.67 | 985300992 | 29271050.67 | 7610976.00 | 21.10 | 26.71 | 10519.19 | false | 0.210467;0.213700;0.210414;0.209691;0.212030 | 985300992;985300992;985300992;985300992;985300992 | 29276896;29261920;29248864;29274336;29279520 | 7596736;7744832;7789216;7491360;7391840 | |
55 | densenet0_stage2_conv1_fwd | Convolution | [32,128,28,28] | 153165 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 64.00 | 48168960 | 12103626.67 | 29202880.00 | 42.80 | 1.17 | 752.64 | true | 0.426370;0.428101;0.428073;0.426672;0.428915 | 48168960;48168960;48168960;48168960;48168960 | 29181536;29179744;29189824;29237280;29251712 | 12111200;12031136;12098336;12215712;12101344 | |
55 | densenet0_stage2_conv1_fwd | Convolution | [32,128,28,28] | 153165 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.67 | 16566272 | 5529962.67 | 3235669.33 | 38.80 | 1.89 | 887.46 | true | 0.387974;0.390828;0.387343;0.389254;0.387872 | 16566272;16566272;16566272;16566272;16566272 | 5524800;5582016;5497760;5567328;5345632 | 3310464;3020800;3046816;3349728;3457728 | |
55 | densenet0_stage2_conv1_fwd | Convolution | [32,128,28,28] | 153165 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 7.00 | 331776 | 153280.00 | 395594.67 | 12.50 | 0.60 | 47.40 | true | 0.124530;0.124452;0.124690;0.124539;0.124697 | 331776;331776;331776;331776;331776 | 427616;406304;378016;402464;360864 | 167360;153280;153280;153280;153280 | |
56 | densenet0_stage2_concat0 | Concat | [32,128,28,28] | 941.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 27.29 | 0 | 8028768.00 | 8178277.33 | 82.40 | 0.00 | 0.00 | true | 0.846987;0.792232;0.849510;0.778673;0.849492;0.804095;0.846773;0.790643;0.848608;0.808300 | 0;0;0;0;0;0;0;0;0;0 | 12845984;3211488;12845984;3211424;12845984;3211744;12845984;3211424;12845984;3211424 | 13037184;3293696;13178336;3290912;13119264;3288288;13043616;3300896;13088096;3306176 | |
56 | densenet0_stage2_concat0 | Concat | [32,128,28,28] | 941.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 23.71 | 0 | 8028768.00 | 8178277.33 | 82.40 | 0.00 | 0.00 | true | 0.846987;0.792232;0.849510;0.778673;0.849492;0.804095;0.846773;0.790643;0.848608;0.808300 | 0;0;0;0;0;0;0;0;0;0 | 12845984;3211488;12845984;3211424;12845984;3211744;12845984;3211424;12845984;3211424 | 13037184;3293696;13178336;3290912;13119264;3288288;13043616;3300896;13088096;3306176 | |
57 | densenet0_stage2_batchnorm2_fwd | BatchNorm | [32,160,28,28] | 961.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 45.67 | 25395200 | 15905802.67 | 15981546.67 | 81.80 | 0.80 | 556.10 | true | 0.818240;0.815244;0.820090;0.816657;0.818870 | 25395200;25395200;25395200;25395200;25395200 | 15906592;15907744;15902496;15913312;15903072 | 15987360;15982592;15979520;15981280;15980768 | |
58 | densenet0_stage2_relu2_fwd | Activation | [32,160,28,28] | 889.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 45.00 | 8028160 | 16056842.67 | 15815616.00 | 94.30 | 0.25 | 178.40 | true | 0.942399;0.941278;0.942738;0.946813;0.942970 | 8028160;8028160;8028160;8028160;8028160 | 16056672;16056928;16056928;16056928;16056672 | 15811648;15816832;15824896;15818368;15810688 | |
59 | densenet0_stage2_conv2_fwd | Convolution | [32,160,28,28] | 61431.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 98.00 | 1034027008 | 28529002.67 | 13102144.00 | 20.50 | 24.84 | 10551.30 | false | 0.205632;0.203509;0.206023;0.204614;0.203492 | 1034027008;1034027008;1034027008;1034027008;1034027008 | 28786048;29292736;27825664;27368448;28975296 | 13116960;13107264;13082208;13078592;13128800 | |
59 | densenet0_stage2_conv2_fwd | Convolution | [32,160,28,28] | 61431.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 608.00 | 7850.67 | 5.90 | 0.00 | 0.00 | true | 0.058275;0.059381;0.059366;0.057839;0.059357 | 0;0;0;0;0 | 7680;7936;7680;7936;7936 | 608;608;352;608;608 | |
60 | densenet0_stage2_batchnorm3_fwd | BatchNorm | [32,128,28,28] | 704 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 37.00 | 20316160 | 12782282.67 | 12818282.67 | 81.20 | 0.79 | 549.09 | true | 0.813900;0.811806;0.812452;0.812154;0.812283 | 20316160;20316160;20316160;20316160;20316160 | 12765920;12811808;12769120;12749920;12819744 | 12808544;12806304;12840000;12840416;12800800 | |
61 | densenet0_stage2_relu3_fwd | Activation | [32,128,28,28] | 701.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 35.67 | 6422528 | 12845408.00 | 12614229.33 | 94.20 | 0.25 | 180.07 | true | 0.943121;0.938034;0.939924;0.943666;0.944430 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845664 | 12614016;12612096;12622528;12616576;12607552 | |
62 | densenet0_stage2_conv3_fwd | Convolution | [32,128,28,28] | 153562.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.00 | 985300992 | 29262581.33 | 7636917.33 | 21.00 | 26.70 | 11196.60 | false | 0.209509;0.210692;0.209746;0.210871;0.210319 | 985300992;985300992;985300992;985300992;985300992 | 7549280;7658912;7637312;7614528;7703968 | 29230240;29257696;29266976;29263072;29267232 | |
62 | densenet0_stage2_conv3_fwd | Convolution | [32,128,28,28] | 153562.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 63.67 | 48168960 | 12728522.67 | 29216661.33 | 42.80 | 1.15 | 756.58 | true | 0.430963;0.430321;0.425440;0.427312;0.425811 | 48168960;48168960;48168960;48168960;48168960 | 12704992;12703648;12750560;12996384;12730016 | 29230048;29189600;29188448;29236768;29230336 | |
62 | densenet0_stage2_conv3_fwd | Convolution | [32,128,28,28] | 153562.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.00 | 16566272 | 5781024.00 | 3463114.67 | 36.00 | 1.79 | 920.35 | true | 0.358871;0.363968;0.361506;0.360900;0.358691 | 16566272;16566272;16566272;16566272;16566272 | 5659456;5825856;5716704;5800512;5903008 | 3501376;3421248;3520832;3466720;3320160 | |
62 | densenet0_stage2_conv3_fwd | Convolution | [32,128,28,28] | 153562.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147904.00 | 365493.33 | 12.30 | 0.65 | 66.36 | true | 0.123567;0.123430;0.123288;0.123312;0.123450 | 331776;331776;331776;331776;331776 | 147904;147904;147904;147904;147904 | 366816;364768;364896;363168;370144 | |
63 | densenet0_stage2_concat1 | Concat | [32,160,28,28] | 1253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 32.14 | 0 | 9633952.00 | 9675034.67 | 82.40 | 0.00 | 0.00 | true | 0.850626;0.790300;0.850329;0.783436;0.849980;0.792982;0.852003;0.802097;0.852034;0.798934 | 0;0;0;0;0;0;0;0;0;0 | 16056480;3211424;16058272;3211424;16056480;3211424;16056480;3211424;16056480;3211424 | 16047360;3314784;16067648;3308256;16001152;3299776;16081248;3311008;16121664;3305792 | |
63 | densenet0_stage2_concat1 | Concat | [32,160,28,28] | 1253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 27.29 | 0 | 9633952.00 | 9675034.67 | 82.40 | 0.00 | 0.00 | true | 0.850626;0.790300;0.850329;0.783436;0.849980;0.792982;0.852003;0.802097;0.852034;0.798934 | 0;0;0;0;0;0;0;0;0;0 | 16056480;3211424;16058272;3211424;16056480;3211424;16056480;3211424;16056480;3211424 | 16047360;3314784;16067648;3308256;16001152;3299776;16081248;3311008;16121664;3305792 | |
64 | densenet0_stage2_batchnorm4_fwd | BatchNorm | [32,192,28,28] | 1098.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 54.00 | 30474240 | 19171104.00 | 19216128.00 | 82.20 | 0.79 | 564.34 | true | 0.822039;0.824660;0.822465;0.821650;0.822656 | 30474240;30474240;30474240;30474240;30474240 | 19224544;19213376;19217184;19198240;19217824 | 19172576;19171872;19168864;19168608;19176096 | |
65 | densenet0_stage2_relu4_fwd | Activation | [32,192,28,28] | 1093.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 53.00 | 9633792 | 19267936.00 | 19036224.00 | 94.30 | 0.25 | 181.77 | true | 0.943262;0.942719;0.941703;0.944274;0.944749 | 9633792;9633792;9633792;9633792;9633792 | 19267936;19267936;19267936;19267936;19267936 | 19028608;19040704;19034432;19036096;19038144 | |
66 | densenet0_stage2_conv4_fwd | Convolution | [32,192,28,28] | 68348.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 114.33 | 1239547904 | 35277845.33 | 13172522.67 | 20.60 | 25.58 | 10841.56 | false | 0.207566;0.207542;0.205302;0.206423;0.203777 | 1239547904;1239547904;1239547904;1239547904;1239547904 | 35788736;35873728;35082816;34961984;33910464 | 13260032;13149440;13105504;13108096;13271072 | |
66 | densenet0_stage2_conv4_fwd | Convolution | [32,192,28,28] | 68348.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059364;0.059405;0.059365;0.059385;0.059358 | 0;0;0;0;0 | 96;96;5472;96;96 | 7296;7424;19968;7424;7424 | |
67 | densenet0_stage2_batchnorm5_fwd | BatchNorm | [32,128,28,28] | 744.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 37.00 | 20316160 | 12816480.00 | 12745109.33 | 81.20 | 0.79 | 549.09 | true | 0.811267;0.812409;0.811498;0.813576;0.811440 | 20316160;20316160;20316160;20316160;20316160 | 12786848;12846752;12832288;12830304;12738400 | 12668064;12765824;12801440;12816192;12649312 | |
68 | densenet0_stage2_relu5_fwd | Activation | [32,128,28,28] | 709.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 35.33 | 6422528 | 12845408.00 | 12612288.00 | 93.90 | 0.25 | 181.77 | true | 0.938617;0.940447;0.940186;0.938792;0.939440 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845408 | 12611264;12616832;12612608;12610048;12612992 | |
69 | densenet0_stage2_conv5_fwd | Convolution | [32,128,28,28] | 152934 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.00 | 985300992 | 29266869.33 | 7499040.00 | 21.00 | 26.80 | 11196.60 | false | 0.212025;0.210184;0.207367;0.213048;0.207476 | 985300992;985300992;985300992;985300992;985300992 | 29280736;29272480;29220192;29249568;29278560 | 7505248;7493344;7498528;7457664;7663776 | |
69 | densenet0_stage2_conv5_fwd | Convolution | [32,128,28,28] | 152934 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 64.33 | 48168960 | 12812448.00 | 29221770.67 | 42.90 | 1.15 | 748.74 | true | 0.430012;0.428498;0.429192;0.430039;0.427720 | 48168960;48168960;48168960;48168960;48168960 | 12744544;12845856;12780768;12810720;12940896 | 29220000;29239136;29224288;29221024;29199776 | |
69 | densenet0_stage2_conv5_fwd | Convolution | [32,128,28,28] | 152934 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.00 | 16566272 | 5637952.00 | 3530933.33 | 36.20 | 1.81 | 920.35 | true | 0.363484;0.361737;0.363269;0.359611;0.360917 | 16566272;16566272;16566272;16566272;16566272 | 5552128;5699104;5564768;5649984;5736128 | 3537184;3564256;3491360;3597120;3325568 | |
69 | densenet0_stage2_conv5_fwd | Convolution | [32,128,28,28] | 152934 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 374794.67 | 12.30 | 0.64 | 66.36 | true | 0.123375;0.123556;0.123292;0.123612;0.123306 | 331776;331776;331776;331776;331776 | 147648;147648;147648;147648;152768 | 350624;352544;378848;407200;392992 | |
70 | densenet0_stage2_concat2 | Concat | [32,192,28,28] | 1421.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 37.00 | 0 | 11242080.00 | 11294330.67 | 81.90 | 0.00 | 0.00 | true | 0.855027;0.784147;0.854570;0.782574;0.854014;0.788231;0.854096;0.774649;0.852101;0.778829 | 0;0;0;0;0;0;0;0;0;0 | 19267744;3211488;19267744;3211488;19267744;3211424;19267744;3226272;19267744;3211424 | 19328096;3271936;19320096;3279872;19330912;3274176;19283552;3280192;19360608;3240256 | |
70 | densenet0_stage2_concat2 | Concat | [32,192,28,28] | 1421.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 31.00 | 0 | 11242080.00 | 11294330.67 | 81.90 | 0.00 | 0.00 | true | 0.855027;0.784147;0.854570;0.782574;0.854014;0.788231;0.854096;0.774649;0.852101;0.778829 | 0;0;0;0;0;0;0;0;0;0 | 19267744;3211488;19267744;3211488;19267744;3211424;19267744;3226272;19267744;3211424 | 19328096;3271936;19320096;3279872;19330912;3274176;19283552;3280192;19360608;3240256 | |
71 | densenet0_stage2_batchnorm6_fwd | BatchNorm | [32,224,28,28] | 1417.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 61.33 | 35553280 | 22398922.67 | 22438474.67 | 82.50 | 0.79 | 579.68 | true | 0.824144;0.824860;0.824592;0.824780;0.825487 | 35553280;35553280;35553280;35553280;35553280 | 22437856;22437216;22440352;22426848;22472288 | 22402592;22398624;22395872;22402272;22391264 | |
72 | densenet0_stage2_relu6_fwd | Activation | [32,224,28,28] | 1212.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 61.00 | 11239424 | 22479200.00 | 22239317.33 | 94.50 | 0.25 | 184.25 | true | 0.946617;0.943069;0.943725;0.946260;0.944816 | 11239424;11239424;11239424;11239424;11239424 | 22479200;22479200;22479200;22479200;22479200 | 22240384;22238976;22237120;22238592;22241664 | |
73 | densenet0_stage2_conv6_fwd | Convolution | [32,224,28,28] | 88078.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 133.33 | 1445068800 | 41674154.67 | 13306186.67 | 20.70 | 26.28 | 10838.04 | false | 0.208522;0.207093;0.206883;0.204808;0.207295 | 1445068800;1445068800;1445068800;1445068800;1445068800 | 41927040;40830144;42347776;40145472;42265280 | 13403616;13177664;13373056;13228992;13316512 | |
73 | densenet0_stage2_conv6_fwd | Convolution | [32,224,28,28] | 88078.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7893.33 | 5.90 | 0.00 | 0.00 | true | 0.059368;0.059399;0.059375;0.059403;0.059354 | 0;0;0;0;0 | 96;96;96;96;96 | 8448;7424;7808;7424;14080 | |
74 | densenet0_stage2_batchnorm7_fwd | BatchNorm | [32,128,28,28] | 785 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12817418.67 | 12601109.33 | 81.10 | 0.80 | 564.34 | true | 0.812723;0.811919;0.810637;0.811532;0.809113 | 20316160;20316160;20316160;20316160;20316160 | 12512544;12730400;12540800;12684992;12577536 | 12842464;12774432;12846752;12670496;12835360 | |
75 | densenet0_stage2_relu7_fwd | Activation | [32,128,28,28] | 737.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 35.67 | 6422528 | 12845408.00 | 12614912.00 | 94.00 | 0.25 | 180.07 | true | 0.942513;0.941084;0.939823;0.939790;0.938825 | 6422528;6422528;6422528;6422528;6422528 | 12611776;12612544;12617920;12620736;12614272 | 12845408;12845408;12845408;12845408;12845408 | |
76 | densenet0_stage2_conv7_fwd | Convolution | [32,128,28,28] | 153436 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 87.67 | 985300992 | 29255008.00 | 7662784.00 | 20.90 | 26.69 | 11239.13 | false | 0.209263;0.209971;0.207663;0.209708;0.209197 | 985300992;985300992;985300992;985300992;985300992 | 29268064;29288224;29209248;29230176;29266784 | 7731168;7411712;7694848;7973088;7562336 | |
76 | densenet0_stage2_conv7_fwd | Convolution | [32,128,28,28] | 153436 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 62.67 | 48168960 | 12739872.00 | 29200213.33 | 42.90 | 1.15 | 768.65 | true | 0.429397;0.429920;0.425086;0.428339;0.431326 | 48168960;48168960;48168960;48168960;48168960 | 12671520;12877472;12754080;12665120;12794016 | 29194656;29264576;29234848;29171136;29170432 | |
76 | densenet0_stage2_conv7_fwd | Convolution | [32,128,28,28] | 153436 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.33 | 16566272 | 5695466.67 | 3389322.67 | 36.10 | 1.82 | 903.63 | true | 0.363019;0.361270;0.359124;0.362600;0.360154 | 16566272;16566272;16566272;16566272;16566272 | 5679520;5692288;5688288;5777280;5705824 | 3299840;3717664;3344992;3074144;3523136 | |
76 | densenet0_stage2_conv7_fwd | Convolution | [32,128,28,28] | 153436 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 382965.33 | 12.30 | 0.63 | 66.36 | true | 0.123303;0.123389;0.123402;0.123284;0.123646 | 331776;331776;331776;331776;331776 | 147648;147648;147648;147648;147648 | 380064;374304;379808;389024;390432 | |
77 | densenet0_stage2_concat3 | Concat | [32,224,28,28] | 1745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 42.00 | 0 | 12845216.00 | 12917925.33 | 81.90 | 0.00 | 0.00 | true | 0.856394;0.777622;0.856687;0.779265;0.855851;0.784637;0.856469;0.781734;0.855322;0.776442 | 0;0;0;0;0;0;0;0;0;0 | 22479008;3211424;22479008;3211424;22479008;3211424;22479008;3211424;22479008;3211424 | 22576032;3303200;22481792;3317024;22535200;3297952;22573376;3297056;22565696;3304640 | |
77 | densenet0_stage2_concat3 | Concat | [32,224,28,28] | 1745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 34.71 | 0 | 12845216.00 | 12917925.33 | 81.90 | 0.00 | 0.00 | true | 0.856394;0.777622;0.856687;0.779265;0.855851;0.784637;0.856469;0.781734;0.855322;0.776442 | 0;0;0;0;0;0;0;0;0;0 | 22479008;3211424;22479008;3211424;22479008;3211424;22479008;3211424;22479008;3211424 | 22576032;3303200;22481792;3317024;22535200;3297952;22573376;3297056;22565696;3304640 | |
78 | densenet0_stage2_batchnorm8_fwd | BatchNorm | [32,256,28,28] | 1862.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 69.00 | 40632320 | 25606112.00 | 25592640.00 | 82.90 | 0.79 | 588.87 | true | 0.827123;0.829582;0.828718;0.828633;0.828514 | 40632320;40632320;40632320;40632320;40632320 | 25599456;25607264;25608800;25607136;25603936 | 25597024;25591200;25589696;25601856;25586272 | |
79 | densenet0_stage2_relu8_fwd | Activation | [32,256,28,28] | 1836.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 70.00 | 12845056 | 25690464.00 | 25458122.67 | 94.50 | 0.25 | 183.50 | true | 0.946039;0.943734;0.944631;0.946920;0.943149 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690464;25690464;25690464 | 25457920;25456832;25454656;25459616;25465984 | |
80 | densenet0_stage2_conv8_fwd | Convolution | [32,256,28,28] | 96223.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 154.33 | 1650589696 | 47011029.33 | 13510453.33 | 20.80 | 27.27 | 10694.99 | false | 0.211212;0.206234;0.206147;0.207528;0.208990 | 1650589696;1650589696;1650589696;1650589696;1650589696 | 46974720;48223040;46539456;47518912;46417856 | 13473664;13462912;13522880;13534816;13568384 | |
80 | densenet0_stage2_conv8_fwd | Convolution | [32,256,28,28] | 96223.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059371;0.059392;0.059370;0.059382;0.059360 | 0;0;0;0;0 | 7296;7424;7424;7424;7424 | 96;96;96;96;96 | |
81 | densenet0_stage2_batchnorm9_fwd | BatchNorm | [32,128,28,28] | 933.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12812896.00 | 12413280.00 | 81.00 | 0.81 | 564.34 | true | 0.810173;0.810431;0.809310;0.811736;0.807685 | 20316160;20316160;20316160;20316160;20316160 | 12446240;12450144;12408416;12385184;12349472 | 12774624;12827936;12791392;12819360;12828448 | |
82 | densenet0_stage2_relu9_fwd | Activation | [32,128,28,28] | 952.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 35.33 | 6422528 | 12845408.00 | 12609642.67 | 93.90 | 0.25 | 181.77 | true | 0.937328;0.941545;0.940413;0.939676;0.936617 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845408 | 12611200;12618752;12600960;12605888;12611840 | |
83 | densenet0_stage2_conv9_fwd | Convolution | [32,128,28,28] | 153487.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.00 | 985300992 | 29251296.00 | 7617472.00 | 20.90 | 26.72 | 11196.60 | false | 0.208160;0.208352;0.209939;0.207012;0.211931 | 985300992;985300992;985300992;985300992;985300992 | 7681504;7535072;7452352;7635840;7776864 | 29230688;29267360;29261792;29251168;29240928 | |
83 | densenet0_stage2_conv9_fwd | Convolution | [32,128,28,28] | 153487.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 64.33 | 48168960 | 12798090.67 | 29221610.67 | 42.90 | 1.15 | 748.74 | true | 0.428654;0.428377;0.427788;0.428610;0.429710 | 48168960;48168960;48168960;48168960;48168960 | 12871264;12957984;12726752;12717024;12796256 | 29205088;29256416;29251072;29208672;29194880 | |
83 | densenet0_stage2_conv9_fwd | Convolution | [32,128,28,28] | 153487.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.00 | 16566272 | 5701493.33 | 3390634.67 | 36.00 | 1.82 | 920.35 | true | 0.361261;0.359023;0.358418;0.363043;0.359512 | 16566272;16566272;16566272;16566272;16566272 | 5789120;5652256;5663104;5610080;5973568 | 3382560;3430400;3541152;3358944;3358400 | |
83 | densenet0_stage2_conv9_fwd | Convolution | [32,128,28,28] | 153487.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 391370.67 | 12.30 | 0.62 | 66.36 | true | 0.123498;0.123357;0.123404;0.123544;0.123372 | 331776;331776;331776;331776;331776 | 392864;373536;406560;398368;382880 | 147648;147648;147648;147648;147648 | |
84 | densenet0_stage2_concat4 | Concat | [32,256,28,28] | 2706.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 46.71 | 0 | 14450869.33 | 14502277.33 | 82.20 | 0.00 | 0.00 | true | 0.857699;0.785676;0.858710;0.785771;0.858668;0.780430;0.857997;0.784617;0.857877;0.781447 | 0;0;0;0;0;0;0;0;0;0 | 25690272;3211424;25690272;3211424;25690272;3211488;25690272;3211488;25693600;3211424 | 25675008;3324704;25731648;3312704;25727296;3305376;25728288;3323552;25644512;3318592 | |
84 | densenet0_stage2_concat4 | Concat | [32,256,28,28] | 2706.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 38.29 | 0 | 14450869.33 | 14502277.33 | 82.20 | 0.00 | 0.00 | true | 0.857699;0.785676;0.858710;0.785771;0.858668;0.780430;0.857997;0.784617;0.857877;0.781447 | 0;0;0;0;0;0;0;0;0;0 | 25690272;3211424;25690272;3211424;25690272;3211488;25690272;3211488;25693600;3211424 | 25675008;3324704;25731648;3312704;25727296;3305376;25728288;3323552;25644512;3318592 | |
85 | densenet0_stage2_batchnorm10_fwd | BatchNorm | [32,288,28,28] | 2653 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 77.00 | 45711360 | 28851680.00 | 28836352.00 | 83.00 | 0.79 | 593.65 | true | 0.829872;0.829778;0.830549;0.828912;0.830055 | 45711360;45711360;45711360;45711360;45711360 | 28858208;28853920;28844512;28846368;28854752 | 28823968;28839680;28842464;28833888;28835488 | |
86 | densenet0_stage2_relu10_fwd | Activation | [32,288,28,28] | 2435 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 77.33 | 14450688 | 28901728.00 | 28672384.00 | 94.50 | 0.25 | 186.86 | true | 0.945255;0.944532;0.945357;0.946521;0.945550 | 14450688;14450688;14450688;14450688;14450688 | 28901728;28901728;28901728;28915296;28901728 | 28671936;28668416;28671616;28678656;28673600 | |
87 | densenet0_stage2_conv10_fwd | Convolution | [32,288,28,28] | 103450.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 171.00 | 1856110592 | 53175594.67 | 13544704.00 | 21.10 | 27.82 | 10854.45 | false | 0.209309;0.212729;0.210790;0.211808;0.211099 | 1856110592;1856110592;1856110592;1856110592;1856110592 | 52898432;53673856;53295552;53110080;53121152 | 13556032;13518880;13456160;13559200;13566784 | |
87 | densenet0_stage2_conv10_fwd | Convolution | [32,288,28,28] | 103450.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059371;0.059388;0.059358;0.059464;0.059367 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7424;7424;7424;7424 | |
88 | densenet0_stage2_batchnorm11_fwd | BatchNorm | [32,128,28,28] | 1086.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12772576.00 | 12370869.33 | 80.80 | 0.81 | 564.34 | true | 0.807509;0.808701;0.808676;0.809718;0.805295 | 20316160;20316160;20316160;20316160;20316160 | 12777120;12766624;12742752;12776160;12774944 | 12352608;12401440;12455136;12348000;12358560 | |
89 | densenet0_stage2_relu11_fwd | Activation | [32,128,28,28] | 1113.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12610229.33 | 94.10 | 0.25 | 178.40 | true | 0.941351;0.940653;0.943477;0.940191;0.940087 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845408 | 12621696;12611584;12610528;12608576;12601792 | |
90 | densenet0_stage2_conv11_fwd | Convolution | [32,128,28,28] | 153919.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.33 | 985300992 | 29269002.67 | 7638144.00 | 20.90 | 26.70 | 11154.39 | false | 0.207147;0.207209;0.211076;0.209999;0.209522 | 985300992;985300992;985300992;985300992;985300992 | 29293856;29276960;29214752;29266080;29263968 | 7681824;7627872;7732896;7604736;7428512 | |
90 | densenet0_stage2_conv11_fwd | Convolution | [32,128,28,28] | 153919.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 63.33 | 48168960 | 12792672.00 | 29217184.00 | 42.70 | 1.15 | 760.57 | true | 0.422962;0.427495;0.427651;0.425369;0.427892 | 48168960;48168960;48168960;48168960;48168960 | 29236160;29252544;29203840;29172800;29211552 | 13110688;12733728;12791072;12740960;12845984 | |
90 | densenet0_stage2_conv11_fwd | Convolution | [32,128,28,28] | 153919.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.67 | 16566272 | 5778592.00 | 3411818.67 | 36.10 | 1.80 | 887.46 | true | 0.361871;0.360658;0.361526;0.358456;0.363662 | 16566272;16566272;16566272;16566272;16566272 | 5787232;5705536;5865120;5843008;5551904 | 3301600;3344032;3335552;3555872;3644352 | |
90 | densenet0_stage2_conv11_fwd | Convolution | [32,128,28,28] | 153919.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 382496.00 | 12.30 | 0.63 | 66.36 | true | 0.123315;0.123592;0.123521;0.123418;0.123476 | 331776;331776;331776;331776;331776 | 147648;147648;147648;147648;147648 | 365344;376224;383520;412576;387744 | |
91 | densenet0_stage2_concat5 | Concat | [32,288,28,28] | 3087 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 51.57 | 0 | 16056480.00 | 16095269.33 | 82.70 | 0.00 | 0.00 | true | 0.859827;0.800603;0.857652;0.785670;0.860226;0.789905;0.858944;0.794609;0.859349;0.782734 | 0;0;0;0;0;0;0;0;0;0 | 28982752;3255744;28954656;3265472;28967520;3271488;28838464;3273152;28947520;3286336 | 28901536;3211424;28907168;3211424;28901536;3211424;28901536;3211424;28901536;3211424 | |
91 | densenet0_stage2_concat5 | Concat | [32,288,28,28] | 3087 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 41.86 | 0 | 16056480.00 | 16095269.33 | 82.70 | 0.00 | 0.00 | true | 0.859827;0.800603;0.857652;0.785670;0.860226;0.789905;0.858944;0.794609;0.859349;0.782734 | 0;0;0;0;0;0;0;0;0;0 | 28901536;3211424;28907168;3211424;28901536;3211424;28901536;3211424;28901536;3211424 | 28982752;3255744;28954656;3265472;28967520;3271488;28838464;3273152;28947520;3286336 | |
92 | densenet0_stage2_batchnorm12_fwd | BatchNorm | [32,320,28,28] | 2971.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 86.33 | 50790400 | 32017930.67 | 32082720.00 | 83.20 | 0.79 | 588.31 | true | 0.831233;0.832077;0.832340;0.832612;0.832203 | 50790400;50790400;50790400;50790400;50790400 | 32014944;32009440;32022112;32020960;32017888 | 32118752;32086496;32077088;32084576;32075936 | |
93 | densenet0_stage2_relu12_fwd | Activation | [32,320,28,28] | 2706.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 86.33 | 16056320 | 32112992.00 | 31871936.00 | 94.70 | 0.25 | 185.98 | true | 0.946600;0.947045;0.948709;0.946904;0.944638 | 16056320;16056320;16056320;16056320;16056320 | 32112992;32112992;32115040;32112992;32112992 | 31852672;31873152;31882048;31872128;31870528 | |
94 | densenet0_stage2_conv12_fwd | Convolution | [32,320,28,28] | 123164.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 188.67 | 2061631488 | 59828522.67 | 13439413.33 | 21.10 | 28.14 | 10927.36 | false | 0.212538;0.210034;0.209735;0.211661;0.211557 | 2061631488;2061631488;2061631488;2061631488;2061631488 | 13248992;13441824;13363232;13564096;13513184 | 60369280;59866240;60105152;59514176;59438976 | |
94 | densenet0_stage2_conv12_fwd | Convolution | [32,320,28,28] | 123164.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059371;0.059394;0.059351;0.059385;0.059371 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7424;7424;7424;7424 | |
95 | densenet0_stage2_batchnorm13_fwd | BatchNorm | [32,128,28,28] | 1169.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12808245.33 | 12476725.33 | 81.00 | 0.80 | 564.34 | true | 0.809505;0.810536;0.807193;0.810472;0.811658 | 20316160;20316160;20316160;20316160;20316160 | 12784032;12814816;12789728;12820192;12821600 | 12667040;12486240;12530272;12354720;12413664 | |
96 | densenet0_stage2_relu13_fwd | Activation | [32,128,28,28] | 1140 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12610176.00 | 93.90 | 0.25 | 178.40 | true | 0.939126;0.938876;0.940234;0.936532;0.939955 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845408 | 12616128;12605120;12612288;12613120;12604800 | |
97 | densenet0_stage2_conv13_fwd | Convolution | [32,128,28,28] | 153370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.33 | 985300992 | 29255434.67 | 7772458.67 | 21.10 | 26.61 | 11154.39 | false | 0.206281;0.212781;0.210656;0.209125;0.211801 | 985300992;985300992;985300992;985300992;985300992 | 29233120;29271456;29269088;29252832;29244384 | 7898208;7692960;7534144;7889920;7734496 | |
97 | densenet0_stage2_conv13_fwd | Convolution | [32,128,28,28] | 153370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 65.67 | 48168960 | 12714165.33 | 29211317.33 | 42.90 | 1.15 | 733.53 | true | 0.430144;0.429000;0.429175;0.427814;0.428058 | 48168960;48168960;48168960;48168960;48168960 | 12698848;12682592;12741600;12737696;12705952 | 29274720;29193504;29220416;29198144;29215392 | |
97 | densenet0_stage2_conv13_fwd | Convolution | [32,128,28,28] | 153370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.00 | 16566272 | 5817920.00 | 3286677.33 | 36.00 | 1.82 | 920.35 | true | 0.360242;0.357858;0.361913;0.359323;0.360558 | 16566272;16566272;16566272;16566272;16566272 | 5806240;5965312;5621344;5855872;5791648 | 3023168;3410272;3493216;3190496;3259264 | |
97 | densenet0_stage2_conv13_fwd | Convolution | [32,128,28,28] | 153370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 397685.33 | 12.30 | 0.61 | 66.36 | true | 0.123339;0.123499;0.123329;0.123434;0.123501 | 331776;331776;331776;331776;331776 | 147648;147648;147648;147648;147648 | 396832;396448;382624;399776;405152 | |
98 | densenet0_stage2_concat6 | Concat | [32,320,28,28] | 3356.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 56.86 | 0 | 17662144.00 | 17723498.67 | 82.50 | 0.00 | 0.00 | true | 0.860669;0.780704;0.861046;0.780314;0.861477;0.786153;0.861583;0.792761;0.861031;0.789066 | 0;0;0;0;0;0;0;0;0;0 | 32231968;3228064;32217440;3245312;32218272;3229664;32161760;3254016;32220512;3244192 | 32112800;3211488;32112800;3211488;32112800;3211424;32112800;3211424;32112800;3211488 | |
98 | densenet0_stage2_concat6 | Concat | [32,320,28,28] | 3356.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 46.00 | 0 | 17662144.00 | 17723498.67 | 82.50 | 0.00 | 0.00 | true | 0.860669;0.780704;0.861046;0.780314;0.861477;0.786153;0.861583;0.792761;0.861031;0.789066 | 0;0;0;0;0;0;0;0;0;0 | 32231968;3228064;32217440;3245312;32218272;3229664;32161760;3254016;32220512;3244192 | 32112800;3211488;32112800;3211488;32112800;3211424;32112800;3211424;32112800;3211488 | |
99 | densenet0_stage2_batchnorm14_fwd | BatchNorm | [32,352,28,28] | 3448 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 94.00 | 55869440 | 35304202.67 | 35285248.00 | 83.30 | 0.79 | 594.36 | true | 0.833380;0.834234;0.832771;0.832548;0.832635 | 55869440;55869440;55869440;55869440;55869440 | 35304224;35302816;35301216;35305568;35305888 | 35284512;35287264;35296672;35271520;35283968 | |
100 | densenet0_stage2_relu14_fwd | Activation | [32,352,28,28] | 2952.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 94.67 | 17661952 | 35325024.00 | 35091776.00 | 95.30 | 0.25 | 186.57 | true | 0.951810;0.954526;0.952565;0.952400;0.952839 | 17661952;17661952;17661952;17661952;17661952 | 35324256;35326560;35324256;35330144;35324256 | 35107520;35087872;35085632;35101824;35081472 | |
101 | densenet0_stage2_conv14_fwd | Convolution | [32,352,28,28] | 132305.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 212.33 | 2267152384 | 66245738.67 | 13484096.00 | 21.10 | 28.44 | 10677.34 | false | 0.212119;0.210697;0.210775;0.215153;0.210014 | 2267152384;2267152384;2267152384;2267152384;2267152384 | 66988544;65489024;65978944;66430080;66328192 | 13321824;13524384;13487360;13462752;13502176 | |
101 | densenet0_stage2_conv14_fwd | Convolution | [32,352,28,28] | 132305.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059364;0.059380;0.059358;0.059384;0.059448 | 0;0;0;0;0 | 96;96;96;96;96 | 7424;7424;7424;7424;7424 | |
102 | densenet0_stage2_batchnorm15_fwd | BatchNorm | [32,128,28,28] | 1241 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.33 | 20316160 | 12805365.33 | 12423946.67 | 81.00 | 0.81 | 559.17 | true | 0.808848;0.809141;0.811134;0.812424;0.808354 | 20316160;20316160;20316160;20316160;20316160 | 12566048;12383136;12435872;12412576;12423392 | 12844896;12791456;12801824;12792672;12821600 | |
103 | densenet0_stage2_relu15_fwd | Activation | [32,128,28,28] | 1127 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12616000.00 | 94.00 | 0.25 | 178.40 | true | 0.937178;0.945857;0.939649;0.940935;0.940539 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12852576 | 12614080;12615744;12608768;12618176;12622784 | |
104 | densenet0_stage2_conv15_fwd | Convolution | [32,128,28,28] | 153840.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.00 | 985300992 | 29252554.67 | 7528053.33 | 21.10 | 26.79 | 11196.60 | false | 0.211857;0.209183;0.211220;0.210008;0.211511 | 985300992;985300992;985300992;985300992;985300992 | 29256224;29246880;29232416;29254560;29279584 | 7413408;7760160;7921216;7366208;7410592 | |
104 | densenet0_stage2_conv15_fwd | Convolution | [32,128,28,28] | 153840.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 63.33 | 48168960 | 12823776.00 | 29248213.33 | 43.00 | 1.14 | 760.57 | true | 0.430116;0.429743;0.431468;0.427474;0.430980 | 48168960;48168960;48168960;48168960;48168960 | 29233184;29226272;29252768;29258688;29267296 | 12783520;12862176;12947232;12698592;12825632 | |
104 | densenet0_stage2_conv15_fwd | Convolution | [32,128,28,28] | 153840.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.33 | 16566272 | 5640768.00 | 3524032.00 | 36.00 | 1.81 | 903.63 | true | 0.358320;0.360800;0.363701;0.359249;0.360652 | 16566272;16566272;16566272;16566272;16566272 | 3582144;3341280;3103296;3648672;3686368 | 5465376;5827744;5770176;5506368;5645760 | |
104 | densenet0_stage2_conv15_fwd | Convolution | [32,128,28,28] | 153840.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 361034.67 | 12.30 | 0.65 | 66.36 | true | 0.123477;0.123556;0.123325;0.123472;0.123381 | 331776;331776;331776;331776;331776 | 147648;149696;147648;147648;147648 | 370848;369760;356256;357088;350176 | |
105 | densenet0_stage2_concat7 | Concat | [32,352,28,28] | 3721 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 61.71 | 0 | 19267744.00 | 19305098.67 | 82.40 | 0.00 | 0.00 | true | 0.860611;0.787699;0.861132;0.783045;0.861214;0.783852;0.860743;0.785800;0.862432;0.789119 | 0;0;0;0;0;0;0;0;0;0 | 35324064;3211424;35324064;3211424;35324064;3211424;35324064;3211424;35324064;3211424 | 35427520;3260736;35371680;3266752;35373056;3262784;35376608;3275456;35280864;3253824 | |
105 | densenet0_stage2_concat7 | Concat | [32,352,28,28] | 3721 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 49.57 | 0 | 19267744.00 | 19305098.67 | 82.40 | 0.00 | 0.00 | true | 0.860611;0.787699;0.861132;0.783045;0.861214;0.783852;0.860743;0.785800;0.862432;0.789119 | 0;0;0;0;0;0;0;0;0;0 | 35324064;3211424;35324064;3211424;35324064;3211424;35324064;3211424;35324064;3211424 | 35427520;3260736;35371680;3266752;35373056;3262784;35376608;3275456;35280864;3253824 | |
106 | densenet0_stage2_batchnorm16_fwd | BatchNorm | [32,384,28,28] | 3683.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 102.00 | 60948480 | 38512010.67 | 38499637.33 | 83.50 | 0.79 | 597.53 | true | 0.835923;0.835322;0.836019;0.834706;0.834738 | 60948480;60948480;60948480;60948480;60948480 | 38510944;38516192;38511968;38511456;38512608 | 38504672;38517152;38497664;38491680;38496576 | |
107 | densenet0_stage2_relu16_fwd | Activation | [32,384,28,28] | 3236 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 102.33 | 19267584 | 38535520.00 | 38303274.67 | 95.40 | 0.25 | 188.28 | true | 0.953841;0.954392;0.954707;0.954879;0.953572 | 19267584;19267584;19267584;19267584;19267584 | 38535520;38535520;38535520;38535520;38535520 | 38302272;38286912;38304128;38303424;38310208 | |
108 | densenet0_stage2_conv16_fwd | Convolution | [32,384,28,28] | 138842.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 241.33 | 2472673280 | 73117909.33 | 13277514.67 | 21.50 | 28.62 | 10245.90 | false | 0.215117;0.218486;0.213436;0.213579;0.215005 | 2472673280;2472673280;2472673280;2472673280;2472673280 | 73455808;73405696;72435968;73086144;72861888 | 13249472;13166240;13175168;13407904;13419136 | |
108 | densenet0_stage2_conv16_fwd | Convolution | [32,384,28,28] | 138842.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059368;0.059387;0.059365;0.059399;0.059361 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7424;7424;7424;7424 | |
109 | densenet0_stage2_batchnorm17_fwd | BatchNorm | [32,128,28,28] | 1187 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.33 | 20316160 | 12809888.00 | 12641664.00 | 81.10 | 0.80 | 559.17 | true | 0.810454;0.810290;0.813680;0.810538;0.811415 | 20316160;20316160;20316160;20316160;20316160 | 12817760;12796448;12788448;12815456;12830112 | 12672960;12751840;12744608;12507424;12496416 | |
110 | densenet0_stage2_relu17_fwd | Activation | [32,128,28,28] | 946.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12611754.67 | 94.00 | 0.25 | 178.40 | true | 0.941954;0.938240;0.938121;0.941969;0.938768 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845408 | 12610816;12613312;12603520;12612864;12611584 | |
111 | densenet0_stage2_conv17_fwd | Convolution | [32,128,28,28] | 153656.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.33 | 985300992 | 29232842.67 | 7553194.67 | 21.00 | 26.78 | 11154.39 | false | 0.210762;0.209010;0.218292;0.209121;0.208411 | 985300992;985300992;985300992;985300992;985300992 | 29251168;29224288;29236640;29216928;29237600 | 7530528;7759520;7481504;7457536;7647552 | |
111 | densenet0_stage2_conv17_fwd | Convolution | [32,128,28,28] | 153656.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 64.00 | 48168960 | 12745866.67 | 29216128.00 | 43.00 | 1.15 | 752.64 | true | 0.428490;0.430303;0.432795;0.429835;0.427727 | 48168960;48168960;48168960;48168960;48168960 | 12635744;12840864;12716320;12940896;12680416 | 29212832;29221536;29213440;29233888;29213408 | |
111 | densenet0_stage2_conv17_fwd | Convolution | [32,128,28,28] | 153656.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.33 | 16566272 | 5619456.00 | 3458560.00 | 36.20 | 1.82 | 903.63 | true | 0.363683;0.361483;0.359609;0.362735;0.363087 | 16566272;16566272;16566272;16566272;16566272 | 5537504;5851360;5459776;5589696;5731168 | 3491680;3307680;3499520;3473888;3410112 | |
111 | densenet0_stage2_conv17_fwd | Convolution | [32,128,28,28] | 153656.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 385578.67 | 12.30 | 0.62 | 66.36 | true | 0.123510;0.123488;0.123315;0.123458;0.123316 | 331776;331776;331776;331776;331776 | 147648;147648;147648;147648;147648 | 375712;376992;383904;406432;395840 | |
112 | densenet0_stage2_concat8 | Concat | [32,384,28,28] | 3120 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 66.00 | 0 | 20873376.00 | 20952954.67 | 82.30 | 0.00 | 0.00 | true | 0.862620;0.775559;0.861411;0.778835;0.862164;0.786799;0.861785;0.784097;0.861563;0.783674 | 0;0;0;0;0;0;0;0;0;0 | 38535328;3211424;38535328;3211424;38535328;3211424;38535328;3211424;38535328;3211424 | 38574752;3326752;38548768;3339040;38612576;3348480;38620064;3354112;38543104;3348512 | |
112 | densenet0_stage2_concat8 | Concat | [32,384,28,28] | 3120 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 52.57 | 0 | 20873376.00 | 20952954.67 | 82.30 | 0.00 | 0.00 | true | 0.862620;0.775559;0.861411;0.778835;0.862164;0.786799;0.861785;0.784097;0.861563;0.783674 | 0;0;0;0;0;0;0;0;0;0 | 38535328;3211424;38535328;3211424;38535328;3211424;38535328;3211424;38535328;3211424 | 38574752;3326752;38548768;3339040;38612576;3348480;38620064;3354112;38543104;3348512 | |
113 | densenet0_stage2_batchnorm18_fwd | BatchNorm | [32,416,28,28] | 2944.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 110.00 | 66027520 | 41726474.67 | 41652266.67 | 83.60 | 0.79 | 600.25 | true | 0.836747;0.835263;0.835210;0.835711;0.837037 | 66027520;66027520;66027520;66027520;66027520 | 41725152;41727712;41724768;41727968;41726560 | 41655200;41661856;41655104;41645600;41646496 | |
114 | densenet0_stage2_relu18_fwd | Activation | [32,416,28,28] | 2254.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 110.67 | 20873216 | 41746784.00 | 41513920.00 | 95.60 | 0.25 | 188.61 | true | 0.955417;0.954884;0.957032;0.956416;0.955326 | 20873216;20873216;20873216;20873216;20873216 | 41521344;41509312;41507904;41523328;41511104 | 41746784;41746784;41746784;41746784;41746784 | |
115 | densenet0_stage2_conv18_fwd | Convolution | [32,416,28,28] | 158682.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 240.67 | 2678194176 | 79752725.33 | 13249845.33 | 21.30 | 28.80 | 11128.22 | false | 0.213244;0.214924;0.213448;0.212886;0.212321 | 2678194176;2678194176;2678194176;2678194176;2678194176 | 79393024;79899712;80510016;79142656;79965440 | 13305696;13391104;13286400;13157440;13124672 | |
115 | densenet0_stage2_conv18_fwd | Convolution | [32,416,28,28] | 158682.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059356;0.059394;0.059655;0.059416;0.059374 | 0;0;0;0;0 | 96;96;96;96;96 | 6912;7424;7424;7424;7424 | |
116 | densenet0_stage2_batchnorm19_fwd | BatchNorm | [32,128,28,28] | 901.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12766794.67 | 12653365.33 | 81.10 | 0.80 | 564.34 | true | 0.811233;0.810157;0.810141;0.812533;0.811094 | 20316160;20316160;20316160;20316160;20316160 | 12692128;12810272;12807456;12583712;12800800 | 12597376;12524000;12625344;12737376;12785696 | |
117 | densenet0_stage2_relu19_fwd | Activation | [32,128,28,28] | 704.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12615424.00 | 94.10 | 0.25 | 178.40 | true | 0.940480;0.940737;0.941575;0.941308;0.942804 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12847456;12845408 | 12616448;12615872;12619392;12608320;12613952 | |
118 | densenet0_stage2_conv19_fwd | Convolution | [32,128,28,28] | 153183.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 89.00 | 985300992 | 29251488.00 | 7579722.67 | 21.10 | 26.75 | 11070.80 | false | 0.209716;0.214875;0.211054;0.209081;0.211911 | 985300992;985300992;985300992;985300992;985300992 | 29257760;29241760;29254944;29226976;29291744 | 7656160;7433824;7606496;7830656;7476512 | |
118 | densenet0_stage2_conv19_fwd | Convolution | [32,128,28,28] | 153183.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 63.00 | 48168960 | 12738229.33 | 29232138.67 | 42.80 | 1.15 | 764.59 | true | 0.426369;0.430501;0.428676;0.429741;0.426297 | 48168960;48168960;48168960;48168960;48168960 | 12772512;12700192;12741984;12823776;12657184 | 29231680;29238336;29193152;29231392;29233344 | |
118 | densenet0_stage2_conv19_fwd | Convolution | [32,128,28,28] | 153183.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.67 | 16566272 | 5663157.33 | 3427562.67 | 36.20 | 1.82 | 887.46 | true | 0.363190;0.361958;0.361156;0.362513;0.359612 | 16566272;16566272;16566272;16566272;16566272 | 5727008;5517696;5609312;5803232;5653152 | 3319552;3517088;3446048;3203648;3545600 | |
118 | densenet0_stage2_conv19_fwd | Convolution | [32,128,28,28] | 153183.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147733.33 | 374965.33 | 12.30 | 0.63 | 66.36 | true | 0.123228;0.123352;0.123514;0.123390;0.123414 | 331776;331776;331776;331776;331776 | 374432;380960;369504;352928;415200 | 147648;147648;147904;147648;150464 | |
119 | densenet0_stage2_concat9 | Concat | [32,416,28,28] | 2982.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 71.00 | 0 | 22479008.00 | 22535258.67 | 82.80 | 0.00 | 0.00 | true | 0.862659;0.785082;0.863162;0.800202;0.863498;0.782300;0.863356;0.778303;0.862601;0.793090 | 0;0;0;0;0;0;0;0;0;0 | 41746592;3211424;41746592;3211424;41746592;3211424;41746592;3211424;41746592;3211424 | 41818432;3288000;41802464;3285792;41783584;3292480;41790880;3289792;41758880;3295936 | |
119 | densenet0_stage2_concat9 | Concat | [32,416,28,28] | 2982.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 56.57 | 0 | 22479008.00 | 22535258.67 | 82.80 | 0.00 | 0.00 | true | 0.862659;0.785082;0.863162;0.800202;0.863498;0.782300;0.863356;0.778303;0.862601;0.793090 | 0;0;0;0;0;0;0;0;0;0 | 41746592;3211424;41746592;3211424;41746592;3211424;41746592;3211424;41746592;3211424 | 41818432;3288000;41802464;3285792;41783584;3292480;41790880;3289792;41758880;3295936 | |
120 | densenet0_stage2_batchnorm20_fwd | BatchNorm | [32,448,28,28] | 3142 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 117.67 | 71106560 | 44946698.67 | 44934709.33 | 83.80 | 0.79 | 604.30 | true | 0.838165;0.836609;0.837955;0.837206;0.837786 | 71106560;71106560;71106560;71106560;71106560 | 44946528;44947808;44948320;44945760;44945632 | 44931232;44928352;44921760;44944544;44946016 | |
121 | densenet0_stage2_relu20_fwd | Activation | [32,448,28,28] | 2432.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 118.33 | 22478848 | 44958048.00 | 44713898.67 | 95.70 | 0.25 | 189.96 | true | 0.957032;0.956775;0.954941;0.956586;0.956297 | 22478848;22478848;22478848;22478848;22478848 | 44958048;44958048;44958048;44958048;44958048 | 44714816;44722944;44732608;44703936;44699648 | |
122 | densenet0_stage2_conv20_fwd | Convolution | [32,448,28,28] | 167345 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 281.67 | 2883715072 | 86533866.67 | 13543701.33 | 21.40 | 28.81 | 10238.03 | false | 0.218637;0.215849;0.212195;0.213794;0.210430 | 2883715072;2883715072;2883715072;2883715072;2883715072 | 87247552;86503040;85904640;86702464;86396096 | 13500736;13474560;13582240;13548128;13718912 | |
122 | densenet0_stage2_conv20_fwd | Convolution | [32,448,28,28] | 167345 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7466.67 | 5.90 | 0.00 | 0.00 | true | 0.059366;0.059393;0.059362;0.059404;0.059374 | 0;0;0;0;0 | 7168;9728;7424;7552;7424 | 96;1120;96;96;96 | |
123 | densenet0_stage2_batchnorm21_fwd | BatchNorm | [32,128,28,28] | 904.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12704096.00 | 12374389.33 | 80.70 | 0.81 | 564.34 | true | 0.806996;0.806648;0.804154;0.809718;0.808575 | 20316160;20316160;20316160;20316160;20316160 | 12695456;12613920;12707808;12831968;12709024 | 12423328;12429536;12329504;12370336;12208480 | |
124 | densenet0_stage2_relu21_fwd | Activation | [32,128,28,28] | 720.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 35.67 | 6422528 | 12845408.00 | 12610858.67 | 93.90 | 0.25 | 180.07 | true | 0.938524;0.942247;0.937389;0.937767;0.939263 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12847456;12845408;12845408 | 12605696;12614720;12616960;12612160;12604736 | |
125 | densenet0_stage2_conv21_fwd | Convolution | [32,128,28,28] | 153707 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.67 | 985300992 | 29233184.00 | 7698368.00 | 20.90 | 26.68 | 11112.38 | false | 0.214944;0.208178;0.208918;0.208320;0.209066 | 985300992;985300992;985300992;985300992;985300992 | 29234336;29234464;29265568;29229216;29230752 | 7597280;7832800;7665024;7580832;7873184 | |
125 | densenet0_stage2_conv21_fwd | Convolution | [32,128,28,28] | 153707 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 65.00 | 48168960 | 12747104.00 | 29202922.67 | 42.90 | 1.15 | 741.06 | true | 0.427360;0.429773;0.430515;0.430999;0.425984 | 48168960;48168960;48168960;48168960;48168960 | 13076000;12739040;12798048;12704224;12587040 | 29248864;29201696;29194880;29212192;29158624 | |
125 | densenet0_stage2_conv21_fwd | Convolution | [32,128,28,28] | 153707 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.00 | 16566272 | 5853418.67 | 3365877.33 | 36.00 | 1.80 | 920.35 | true | 0.358216;0.360110;0.364982;0.360378;0.360778 | 16566272;16566272;16566272;16566272;16566272 | 5735456;5998400;5826400;5670400;6001728 | 3439392;3233504;3424736;3476064;3186144 | |
125 | densenet0_stage2_conv21_fwd | Convolution | [32,128,28,28] | 153707 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 374154.67 | 12.30 | 0.64 | 66.36 | true | 0.123330;0.123330;0.123494;0.123703;0.123595 | 331776;331776;331776;331776;331776 | 147648;147648;147648;147648;147648 | 381408;375200;365664;365856;391328 | |
126 | densenet0_stage2_concat10 | Concat | [32,448,28,28] | 3818.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 75.57 | 0 | 24084640.00 | 24158981.33 | 82.30 | 0.00 | 0.00 | true | 0.862419;0.783288;0.862681;0.777691;0.863307;0.782762;0.863060;0.785474;0.862073;0.779158 | 0;0;0;0;0;0;0;0;0;0 | 44958464;3349280;44994976;3342240;44964576;3342048;44988512;3345568;45012704;3347488 | 44957856;3211424;44957856;3211424;44957856;3211424;44957856;3211424;44957856;3211424 | |
126 | densenet0_stage2_concat10 | Concat | [32,448,28,28] | 3818.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 59.86 | 0 | 24084640.00 | 24158981.33 | 82.30 | 0.00 | 0.00 | true | 0.862419;0.783288;0.862681;0.777691;0.863307;0.782762;0.863060;0.785474;0.862073;0.779158 | 0;0;0;0;0;0;0;0;0;0 | 44958464;3349280;44994976;3342240;44964576;3342048;44988512;3345568;45012704;3347488 | 44957856;3211424;44957856;3211424;44957856;3211424;44957856;3211424;44957856;3211424 | |
127 | densenet0_stage2_batchnorm22_fwd | BatchNorm | [32,480,28,28] | 4056.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 125.67 | 76185600 | 48142602.67 | 48075808.00 | 83.80 | 0.79 | 606.25 | true | 0.838296;0.837686;0.836689;0.837534;0.837989 | 76185600;76185600;76185600;76185600;76185600 | 48082656;48084832;48070144;48074464;48070304 | 48140768;48143968;48143712;48139232;48143328 | |
128 | densenet0_stage2_relu22_fwd | Activation | [32,480,28,28] | 3306 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 127.67 | 24084480 | 48169312.00 | 47932864.00 | 95.50 | 0.25 | 188.65 | true | 0.955027;0.957040;0.954168;0.954844;0.955984 | 24084480;24084480;24084480;24084480;24084480 | 47922176;47929088;47938368;47932160;47937344 | 48169312;48169312;48169312;48169312;48169312 | |
129 | densenet0_stage2_conv22_fwd | Convolution | [32,480,28,28] | 174547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 300.00 | 3089235968 | 92069333.33 | 13971146.67 | 21.80 | 29.13 | 10297.45 | false | 0.219438;0.218684;0.218950;0.216951;0.214569 | 3089235968;3089235968;3089235968;3089235968;3089235968 | 92680256;90880896;90852928;92692480;92646848 | 13931840;13859840;13955520;14026080;14041952 | |
129 | densenet0_stage2_conv22_fwd | Convolution | [32,480,28,28] | 174547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059371;0.059388;0.059366;0.059391;0.059366 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7424;7424;7424;7424 | |
130 | densenet0_stage2_batchnorm23_fwd | BatchNorm | [32,128,28,28] | 1117 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 20316160 | 12847328.00 | 11939445.33 | 80.80 | 0.82 | 564.34 | true | 0.808548;0.808014;0.805957;0.806724;0.809812 | 20316160;20316160;20316160;20316160;20316160 | 12847328;12847328;12847328;12847456;12847328 | 11967392;12055200;11959200;11891744;11880288 | |
131 | densenet0_stage2_relu23_fwd | Activation | [32,128,28,28] | 911.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12616426.67 | 94.10 | 0.25 | 178.40 | true | 0.942750;0.940763;0.938779;0.941549;0.941416 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845664 | 12628672;12619008;12617920;12612352;12607744 | |
132 | densenet0_stage2_conv23_fwd | Convolution | [32,128,28,28] | 153553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 88.33 | 985300992 | 29268810.67 | 7622901.33 | 20.70 | 26.71 | 11154.39 | false | 0.209290;0.206275;0.206185;0.206497;0.212909 | 985300992;985300992;985300992;985300992;985300992 | 29237408;29284768;29287264;29279008;29242656 | 7476192;7643296;7684608;7540800;7709856 | |
132 | densenet0_stage2_conv23_fwd | Convolution | [32,128,28,28] | 153553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 62.67 | 48168960 | 12772874.67 | 29217813.33 | 42.90 | 1.15 | 768.65 | true | 0.430258;0.428945;0.425385;0.427538;0.429298 | 48168960;48168960;48168960;48168960;48168960 | 13002272;12515808;12684512;12631840;13248160 | 29227808;29225216;29162016;29205248;29222976 | |
132 | densenet0_stage2_conv23_fwd | Convolution | [32,128,28,28] | 153553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 18.00 | 16566272 | 5626474.67 | 3408736.00 | 36.10 | 1.83 | 920.35 | true | 0.359434;0.359644;0.363193;0.360365;0.362951 | 16566272;16566272;16566272;16566272;16566272 | 5605920;5588480;5717280;5543840;5685024 | 3467104;3357344;3395904;3438624;3391680 | |
132 | densenet0_stage2_conv23_fwd | Convolution | [32,128,28,28] | 153553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 331776 | 147648.00 | 389088.00 | 12.30 | 0.62 | 66.36 | true | 0.123645;0.123541;0.123393;0.123445;0.123397 | 331776;331776;331776;331776;331776 | 147648;147648;147648;153536;147648 | 398688;390240;389792;387232;364768 | |
133 | densenet0_stage2_concat11 | Concat | [32,480,28,28] | 4187 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 80.86 | 0 | 25690517.33 | 25736272.00 | 82.30 | 0.00 | 0.00 | true | 0.863537;0.777966;0.863738;0.774765;0.863012;0.779740;0.863418;0.783922;0.864271;0.781546 | 0;0;0;0;0;0;0;0;0;0 | 48169120;3211424;48169120;3212768;48169120;3211424;48169120;3211488;48169120;3211488 | 48237408;3263552;48217120;3245728;48231872;3256416;48265568;3264416;48184256;3243936 | |
133 | densenet0_stage2_concat11 | Concat | [32,480,28,28] | 4187 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 63.86 | 0 | 25690517.33 | 25736272.00 | 82.30 | 0.00 | 0.00 | true | 0.863537;0.777966;0.863738;0.774765;0.863012;0.779740;0.863418;0.783922;0.864271;0.781546 | 0;0;0;0;0;0;0;0;0;0 | 48169120;3211424;48169120;3212768;48169120;3211424;48169120;3211488;48169120;3211488 | 48237408;3263552;48217120;3245728;48231872;3256416;48265568;3264416;48184256;3243936 | |
134 | densenet0_batchnorm2_fwd | BatchNorm | [32,512,28,28] | 4220 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 135.00 | 81264640 | 51389258.67 | 51362933.33 | 83.80 | 0.79 | 601.96 | true | 0.837882;0.838556;0.838714;0.837622;0.838881 | 81264640;81264640;81264640;81264640;81264640 | 51359136;51365472;51364192;51351008;51366368 | 51389664;51388832;51389280;51388448;51390240 | |
135 | densenet0_relu2_fwd | Activation | [32,512,28,28] | 3547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51149674.67 | 95.70 | 0.25 | 188.44 | true | 0.957806;0.956388;0.956872;0.957848;0.957344 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51149632;51140864;51147008;51152384;51153152 | |
136 | densenet0_conv2_fwd | Convolution | [32,512,28,28] | 359292.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 641.67 | 6589513728 | 0.00 | 3445.33 | 23.80 | 1912591.24 | 10269.37 | false | 0.239526;0.238945;0.236753;0.236957;0.240117 | 6589513728;6589513728;6589513728;6589513728;6589513728 | 0;64;0;0;0 | 3488;3360;3488;3360;3488 | |
136 | densenet0_conv2_fwd | Convolution | [32,512,28,28] | 359292.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7424.00 | 5.90 | 0.00 | 0.00 | true | 0.059377;0.059396;0.059473;0.059597;0.059365 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7424;7424;7424;19712 | |
137 | densenet0_pool2_fwd | Pooling | [32,256,28,28] | 17899.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 56.00 | 33718272 | 25669162.67 | 8092266.67 | 51.50 | 1.00 | 602.11 | true | 0.513345;0.516822;0.516120;0.514203;0.516061 | 33718272;33718272;33718272;33718272;33718272 | 25674432;25663744;25675840;25669312;25618304 | 8232992;7967488;8066688;8153344;8056768 | |
138 | densenet0_stage3_batchnorm0_fwd | BatchNorm | [32,256,14,14] | 1172.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 21.00 | 10158080 | 6434666.67 | 5567392.00 | 68.20 | 0.85 | 483.72 | true | 0.677192;0.672220;0.695862;0.679390;0.690739 | 10158080;10158080;10158080;10158080;10158080 | 6430656;6435744;6437504;6434400;6433856 | 5597792;5569152;5567744;5521344;5565280 | |
139 | densenet0_stage3_relu0_fwd | Activation | [32,256,14,14] | 483 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 18.33 | 3211264 | 3915018.67 | 6403328.00 | 90.60 | 0.31 | 175.16 | true | 0.906024;0.905446;0.905308;0.905144;0.908619 | 3211264;3211264;3211264;3211264;3211264 | 3913952;3909344;3925216;3914976;3916128 | 6402656;6407808;6385952;6403968;6403360 | |
140 | densenet0_stage3_conv0_fwd | Convolution | [32,256,14,14] | 24629.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 63.33 | 412647424 | 747157.33 | 3560874.67 | 8.10 | 95.79 | 6515.52 | false | 0.080799;0.081267;0.080885;0.081130;0.080748 | 412647424;412647424;412647424;412647424;412647424 | 3563296;3552640;3564256;3557696;3561632 | 746496;747840;719104;773920;747136 | |
140 | densenet0_stage3_conv0_fwd | Convolution | [32,256,14,14] | 24629.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 2688.00 | 5.70 | 0.00 | 0.00 | true | 0.058078;0.055412;0.057962;0.058070;0.055221 | 0;0;0;0;0 | 352;352;352;352;352 | 2560;2688;2816;2688;2688 | |
141 | densenet0_stage3_batchnorm1_fwd | BatchNorm | [32,128,14,14] | 759 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 420522.67 | 3488298.67 | 36.50 | 1.30 | 423.25 | true | 0.365484;0.366062;0.364845;0.359563;0.364109 | 5079040;5079040;5079040;5079040;5079040 | 426496;398272;423008;423360;415200 | 3484448;3494976;3475712;3498272;3485472 | |
142 | densenet0_stage3_relu1_fwd | Activation | [32,128,14,14] | 292.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 23072.00 | 1642709.33 | 77.00 | 0.96 | 229.38 | true | 0.768962;0.773132;0.767449;0.778366;0.768558 | 1605632;1605632;1605632;1605632;1605632 | 22944;22816;23456;21792;23712 | 1589152;1650400;1626208;1656928;1651520 | |
143 | densenet0_stage3_conv1_fwd | Convolution | [32,128,14,14] | 39734.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 37.67 | 281673728 | 9813.33 | 1102954.67 | 12.50 | 253.13 | 7478.00 | false | 0.124809;0.124796;0.124802;0.124806;0.124806 | 281673728;281673728;281673728;281673728;281673728 | 1093504;1109888;1106880;1093280;1108480 | 8960;10240;9984;9728;9728 | |
143 | densenet0_stage3_conv1_fwd | Convolution | [32,128,14,14] | 39734.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.33 | 237568 | 150720.00 | 189962.67 | 6.20 | 0.70 | 44.55 | true | 0.062417;0.062428;0.062428;0.062423;0.062423 | 237568;237568;237568;237568;237568 | 150720;150720;151488;150720;150464 | 214016;186528;182144;196640;186720 | |
144 | densenet0_stage3_concat0 | Concat | [32,256,14,14] | 540.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.29 | 0 | 3611280.00 | 3802917.33 | 72.90 | 0.00 | 0.00 | true | 0.822388;0.619284;0.828065;0.634255;0.829390;0.633358;0.828420;0.627389;0.825651;0.624126 | 0;0;0;0;0;0;0;0;0;0 | 6714016;901280;6708960;895232;6710816;904800;6702016;892928;6704064;896384 | 6422848;800288;6422848;798848;6422848;800000;6422848;798688;6422848;798560 | |
144 | densenet0_stage3_concat0 | Concat | [32,256,14,14] | 540.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.00 | 0 | 3611280.00 | 3802917.33 | 72.90 | 0.00 | 0.00 | true | 0.822388;0.619284;0.828065;0.634255;0.829390;0.633358;0.828420;0.627389;0.825651;0.624126 | 0;0;0;0;0;0;0;0;0;0 | 6422848;800288;6422848;798848;6422848;800000;6422848;798688;6422848;798560 | 6714016;901280;6708960;895232;6710816;904800;6702016;892928;6704064;896384 | |
145 | densenet0_stage3_batchnorm2_fwd | BatchNorm | [32,288,14,14] | 969.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 24.00 | 11427840 | 6872298.67 | 7760213.33 | 74.30 | 0.78 | 476.16 | true | 0.744877;0.743268;0.739670;0.741951;0.750560 | 11427840;11427840;11427840;11427840;11427840 | 6874912;6869984;6872000;6867008;6884800 | 7759200;7765248;7749088;7764352;7757088 | |
146 | densenet0_stage3_relu2_fwd | Activation | [32,288,14,14] | 556.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 20.00 | 3612672 | 5269770.67 | 7209824.00 | 91.00 | 0.29 | 180.63 | true | 0.913588;0.905962;0.908671;0.911588;0.908250 | 3612672;3612672;3612672;3612672;3612672 | 5272544;5264352;5275232;5272416;5264224 | 7211616;7208736;7209120;7206784;7212288 | |
147 | densenet0_stage3_conv2_fwd | Convolution | [32,288,14,14] | 26352 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 70.00 | 464027648 | 3126186.67 | 3548768.00 | 8.10 | 69.52 | 6628.97 | false | 0.080319;0.081085;0.080690;0.080955;0.081089 | 464027648;464027648;464027648;464027648;464027648 | 3148064;3098912;3113312;3168896;3117184 | 3539456;3550144;3556704;3534368;3557312 | |
147 | densenet0_stage3_conv2_fwd | Convolution | [32,288,14,14] | 26352 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058174;0.058037;0.057940;0.058077;0.057914 | 0;0;0;0;0 | 96;96;96;96;96 | 2944;2432;2432;16768;2432 | |
148 | densenet0_stage3_batchnorm3_fwd | BatchNorm | [32,128,14,14] | 403 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 480149.33 | 3486261.33 | 37.10 | 1.28 | 423.25 | true | 0.372907;0.368284;0.368357;0.382736;0.371053 | 5079040;5079040;5079040;5079040;5079040 | 469536;467584;508640;489152;481760 | 3483136;3486208;3486240;3488288;3486336 | |
149 | densenet0_stage3_relu3_fwd | Activation | [32,128,14,14] | 260 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 22133.33 | 1624896.00 | 77.90 | 0.97 | 229.38 | true | 0.779847;0.778699;0.772793;0.778062;0.780831 | 1605632;1605632;1605632;1605632;1605632 | 25248;20128;19232;21024;26272 | 1633504;1622816;1644224;1604608;1618368 | |
150 | densenet0_stage3_conv3_fwd | Convolution | [32,128,14,14] | 39582.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 85.33 | 1104906.67 | 12.50 | 254.91 | 7824.27 | false | 0.124825;0.124828;0.124835;0.124817;0.124828 | 281673728;281673728;281673728;281673728;281673728 | 256;0;256;0;0 | 1105696;1104640;1108160;1093568;1104384 | |
150 | densenet0_stage3_conv3_fwd | Convolution | [32,128,14,14] | 39582.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 179914.67 | 6.20 | 0.73 | 47.51 | true | 0.062359;0.062357;0.062352;0.062353;0.062360 | 237568;237568;237568;237568;237568 | 147648;147648;147648;149696;147648 | 180896;180864;181024;164320;177984 | |
151 | densenet0_stage3_concat1 | Concat | [32,288,14,14] | 483 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.57 | 0 | 4014208.00 | 4201802.67 | 73.20 | 0.00 | 0.00 | true | 0.830509;0.626049;0.833626;0.640514;0.838766;0.620041;0.831401;0.627022;0.836020;0.616704 | 0;0;0;0;0;0;0;0;0;0 | 7225408;803008;7225408;802848;7225408;802976;7225408;803008;7228224;803008 | 7502496;903840;7503968;896832;7501472;898208;7507040;900832;7509056;896256 | |
151 | densenet0_stage3_concat1 | Concat | [32,288,14,14] | 483 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.00 | 0 | 4014208.00 | 4201802.67 | 73.20 | 0.00 | 0.00 | true | 0.830509;0.626049;0.833626;0.640514;0.838766;0.620041;0.831401;0.627022;0.836020;0.616704 | 0;0;0;0;0;0;0;0;0;0 | 7225408;803008;7225408;802848;7225408;802976;7225408;803008;7228224;803008 | 7502496;903840;7503968;896832;7501472;898208;7507040;900832;7509056;896256 | |
152 | densenet0_stage3_batchnorm4_fwd | BatchNorm | [32,320,14,14] | 1435.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26.00 | 12697600 | 7729098.67 | 8646581.33 | 75.00 | 0.78 | 488.37 | true | 0.751413;0.748798;0.749281;0.748470;0.760289 | 12697600;12697600;12697600;12697600;12697600 | 7726464;7728736;7727840;7731936;7730720 | 8648448;8652736;8643328;8646240;8645056 | |
153 | densenet0_stage3_relu4_fwd | Activation | [32,320,14,14] | 577 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 23.00 | 4014080 | 6444896.00 | 8023210.67 | 92.00 | 0.28 | 174.53 | true | 0.920251;0.919858;0.918992;0.922322;0.920131 | 4014080;4014080;4014080;4014080;4014080 | 6448480;6444128;6438272;6451808;6442080 | 8024128;8022048;8024000;8023584;8021536 | |
154 | densenet0_stage3_conv4_fwd | Convolution | [32,320,14,14] | 31282.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 76.00 | 515407872 | 4904757.33 | 3547178.67 | 8.10 | 60.98 | 6781.68 | false | 0.080882;0.080411;0.081269;0.080864;0.080678 | 515407872;515407872;515407872;515407872;515407872 | 3547488;3534432;3558848;3535200;3564960 | 4886368;4882784;4945120;4830272;4962720 | |
154 | densenet0_stage3_conv4_fwd | Convolution | [32,320,14,14] | 31282.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058090;0.058037;0.057937;0.058070;0.057963 | 0;0;0;0;0 | 2432;16512;2432;2432;2560 | 96;6752;96;96;96 | |
155 | densenet0_stage3_batchnorm5_fwd | BatchNorm | [32,128,14,14] | 451 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 537845.33 | 3482112.00 | 39.60 | 1.26 | 423.25 | true | 0.398668;0.384451;0.398258;0.391637;0.400609 | 5079040;5079040;5079040;5079040;5079040 | 523520;540992;549024;496192;565312 | 3477984;3486080;3482272;3491296;3464512 | |
156 | densenet0_stage3_relu5_fwd | Activation | [32,128,14,14] | 264 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 22474.67 | 1607840.00 | 77.50 | 0.98 | 240.83 | true | 0.775062;0.771392;0.773619;0.774904;0.778057 | 1605632;1605632;1605632;1605632;1605632 | 22560;20512;25760;24352;20256 | 1597536;1613280;1612704;1639392;1595072 | |
157 | densenet0_stage3_conv5_fwd | Convolution | [32,128,14,14] | 39704.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 85.33 | 1129376.00 | 12.50 | 249.39 | 7897.32 | false | 0.124827;0.124827;0.124828;0.124828;0.124828 | 281673728;281673728;281673728;281673728;281673728 | 256;0;256;0;0 | 1127648;1129632;1130848;1127648;1131264 | |
157 | densenet0_stage3_conv5_fwd | Convolution | [32,128,14,14] | 39704.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 166005.33 | 6.20 | 0.76 | 50.90 | true | 0.062359;0.062358;0.062362;0.062355;0.062359 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 162752;161024;168128;167136;170304 | |
158 | densenet0_stage3_concat2 | Concat | [32,320,14,14] | 570.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 18.14 | 0 | 4415600.00 | 4626069.33 | 72.90 | 0.00 | 0.00 | true | 0.835885;0.623205;0.834371;0.620259;0.834880;0.610688;0.834204;0.627761;0.836038;0.620105 | 0;0;0;0;0;0;0;0;0;0 | 8028224;802976;8028224;802976;8030272;802976;8028224;802976;8028224;802976 | 8350528;898368;8356352;896128;8362496;897216;8358816;889824;8356480;897472 | |
158 | densenet0_stage3_concat2 | Concat | [32,320,14,14] | 570.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.29 | 0 | 4415600.00 | 4626069.33 | 72.90 | 0.00 | 0.00 | true | 0.835885;0.623205;0.834371;0.620259;0.834880;0.610688;0.834204;0.627761;0.836038;0.620105 | 0;0;0;0;0;0;0;0;0;0 | 8350528;898368;8356352;896128;8362496;897216;8358816;889824;8356480;897472 | 8028224;802976;8028224;802976;8030272;802976;8028224;802976;8028224;802976 | |
159 | densenet0_stage3_batchnorm6_fwd | BatchNorm | [32,352,14,14] | 1199 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 28.00 | 13967360 | 8570613.33 | 9514410.67 | 77.30 | 0.77 | 498.83 | true | 0.772091;0.771225;0.771751;0.777944;0.776645 | 13967360;13967360;13967360;13967360;13967360 | 8570368;8567776;8570080;8571392;8574112 | 9511936;9517280;9498240;9514016;9521696 | |
160 | densenet0_stage3_relu6_fwd | Activation | [32,352,14,14] | 627 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 25.00 | 4415488 | 7587392.00 | 8817866.67 | 93.20 | 0.27 | 176.62 | true | 0.931847;0.934932;0.932301;0.931482;0.928028 | 4415488;4415488;4415488;4415488;4415488 | 7583584;7586784;7590112;7585280;7591648 | 8816640;8817376;8825120;8819584;8804704 | |
161 | densenet0_stage3_conv6_fwd | Convolution | [32,352,14,14] | 33295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 82.33 | 566788096 | 6366805.33 | 3594677.33 | 8.10 | 56.90 | 6884.09 | false | 0.081156;0.081281;0.081116;0.081182;0.081127 | 566788096;566788096;566788096;566788096;566788096 | 6336992;6356384;6371392;6375776;6372640 | 3596864;3579200;3580480;3606688;3609472 | |
161 | densenet0_stage3_conv6_fwd | Convolution | [32,352,14,14] | 33295.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058065;0.058234;0.057912;0.058055;0.057939 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2432;2432 | |
162 | densenet0_stage3_batchnorm7_fwd | BatchNorm | [32,128,14,14] | 486 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 628714.67 | 3432704.00 | 38.40 | 1.25 | 423.25 | true | 0.384103;0.378384;0.384738;0.385642;0.382244 | 5079040;5079040;5079040;5079040;5079040 | 619648;644896;616704;631936;634560 | 3430368;3452384;3448352;3419392;3416224 | |
163 | densenet0_stage3_relu7_fwd | Activation | [32,128,14,14] | 264.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.33 | 1605632 | 19360.00 | 1623882.67 | 77.70 | 0.98 | 253.53 | true | 0.777940;0.776236;0.776490;0.780206;0.777037 | 1605632;1605632;1605632;1605632;1605632 | 18976;19744;21280;19360;18976 | 1648768;1631904;1619488;1620256;1611328 | |
164 | densenet0_stage3_conv7_fwd | Convolution | [32,128,14,14] | 39619.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.33 | 281673728 | 0.00 | 1125333.33 | 12.50 | 250.30 | 7971.97 | false | 0.124831;0.124818;0.124831;0.124830;0.124830 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1129376;1124448;1127936;1123616;1120544 | |
164 | densenet0_stage3_conv7_fwd | Convolution | [32,128,14,14] | 39619.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 168074.67 | 6.20 | 0.75 | 47.51 | true | 0.062368;0.062361;0.062364;0.062361;0.062358 | 237568;237568;237568;237568;237568 | 159264;175360;161536;170688;172000 | 147648;147648;147648;147648;147648 | |
165 | densenet0_stage3_concat3 | Concat | [32,352,14,14] | 584 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.57 | 0 | 4817029.33 | 5043706.67 | 73.00 | 0.00 | 0.00 | true | 0.839823;0.620894;0.838998;0.620308;0.836279;0.615918;0.841664;0.617403;0.839525;0.625540 | 0;0;0;0;0;0;0;0;0;0 | 9193504;883936;9202176;883424;9199328;892032;9194816;896768;9207456;885792 | 8831040;803008;8831040;802976;8831040;803008;8831040;803040;8831040;802976 | |
165 | densenet0_stage3_concat3 | Concat | [32,352,14,14] | 584 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.14 | 0 | 4817029.33 | 5043706.67 | 73.00 | 0.00 | 0.00 | true | 0.839823;0.620894;0.838998;0.620308;0.836279;0.615918;0.841664;0.617403;0.839525;0.625540 | 0;0;0;0;0;0;0;0;0;0 | 9193504;883936;9202176;883424;9199328;892032;9194816;896768;9207456;885792 | 8831040;803008;8831040;802976;8831040;803008;8831040;803040;8831040;802976 | |
166 | densenet0_stage3_batchnorm8_fwd | BatchNorm | [32,384,14,14] | 1708.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 31.00 | 15237120 | 9422176.00 | 10397770.67 | 78.40 | 0.77 | 491.52 | true | 0.782122;0.784596;0.780308;0.785046;0.787763 | 15237120;15237120;15237120;15237120;15237120 | 9425312;9417728;9422624;9420512;9423392 | 10402464;10408736;10391232;10399616;10385248 | |
167 | densenet0_stage3_relu8_fwd | Activation | [32,384,14,14] | 680.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 27.33 | 4816896 | 8773397.33 | 9617834.67 | 93.70 | 0.26 | 176.23 | true | 0.939395;0.937082;0.933125;0.941378;0.935320 | 4816896;4816896;4816896;4816896;4816896 | 9616448;9615072;9619680;9617376;9625824 | 8771808;8771680;8770400;8776704;8780384 | |
168 | densenet0_stage3_conv8_fwd | Convolution | [32,384,14,14] | 35112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 88.67 | 618168320 | 7928640.00 | 3603872.00 | 8.10 | 53.60 | 6971.80 | false | 0.081409;0.081298;0.081104;0.081213;0.081025 | 618168320;618168320;618168320;618168320;618168320 | 7933312;7804576;7963968;7913056;7939552 | 3609600;3617920;3580608;3608864;3593152 | |
168 | densenet0_stage3_conv8_fwd | Convolution | [32,384,14,14] | 35112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058080;0.058065;0.057924;0.058067;0.057947 | 0;0;0;0;0 | 96;96;96;352;96 | 2432;2432;2432;13440;2432 | |
169 | densenet0_stage3_batchnorm9_fwd | BatchNorm | [32,128,14,14] | 531.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 693130.67 | 3396842.67 | 40.70 | 1.24 | 423.25 | true | 0.418648;0.408039;0.403800;0.409948;0.398632 | 5079040;5079040;5079040;5079040;5079040 | 705984;699136;678720;701536;674976 | 3402112;3371136;3421568;3383968;3404448 | |
170 | densenet0_stage3_relu9_fwd | Activation | [32,128,14,14] | 263.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 19701.33 | 1622186.67 | 77.30 | 0.98 | 229.38 | true | 0.774434;0.775184;0.771507;0.773117;0.770506 | 1605632;1605632;1605632;1605632;1605632 | 19488;19872;19616;19616;27040 | 1594272;1647936;1612352;1620128;1634080 | |
171 | densenet0_stage3_conv9_fwd | Convolution | [32,128,14,14] | 39650.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 281673728 | 0.00 | 1028000.00 | 12.50 | 274.00 | 8047.82 | false | 0.124818;0.124832;0.124837;0.124838;0.124831 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;2048;0 | 1028352;1008288;1031008;1025536;1030112 | |
171 | densenet0_stage3_conv9_fwd | Convolution | [32,128,14,14] | 39650.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 248650.67 | 6.20 | 0.60 | 47.51 | true | 0.062361;0.062364;0.062373;0.062369;0.062361 | 237568;237568;237568;237568;237568 | 147904;147648;147648;147648;147584 | 246112;272032;248480;251360;244224 | |
172 | densenet0_stage3_concat4 | Concat | [32,384,14,14] | 760.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 20.57 | 0 | 5219328.00 | 5474565.33 | 73.60 | 0.00 | 0.00 | true | 0.841395;0.625543;0.839837;0.622552;0.839744;0.630159;0.840052;0.631017;0.839131;0.634621 | 0;0;0;0;0;0;0;0;0;0 | 9633856;803008;9633856;803008;9633856;802976;9633920;802976;9633856;808384 | 10061024;883360;10060288;883328;10060096;890912;10071392;884864;10057376;893856 | |
172 | densenet0_stage3_concat4 | Concat | [32,384,14,14] | 760.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.00 | 0 | 5219328.00 | 5474565.33 | 73.60 | 0.00 | 0.00 | true | 0.841395;0.625543;0.839837;0.622552;0.839744;0.630159;0.840052;0.631017;0.839131;0.634621 | 0;0;0;0;0;0;0;0;0;0 | 10061024;883360;10060288;883328;10060096;890912;10071392;884864;10057376;893856 | 9633856;803008;9633856;803008;9633856;802976;9633920;802976;9633856;808384 | |
173 | densenet0_stage3_batchnorm10_fwd | BatchNorm | [32,416,14,14] | 1453 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 32.33 | 16506880 | 10256117.33 | 11270069.33 | 79.30 | 0.77 | 510.53 | true | 0.793400;0.796733;0.794798;0.789687;0.790518 | 16506880;16506880;16506880;16506880;16506880 | 10256352;10257600;10257696;10254400;10254208 | 11273216;11276352;11275296;11261696;11259808 | |
174 | densenet0_stage3_relu10_fwd | Activation | [32,416,14,14] | 729.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 30.00 | 5218304 | 10018869.33 | 10419914.67 | 93.00 | 0.26 | 173.94 | true | 0.930217;0.929593;0.927807;0.930249;0.929945 | 5218304;5218304;5218304;5218304;5218304 | 10416736;10424288;10416704;10420160;10422848 | 10010752;10024928;10010848;10028256;10020832 | |
175 | densenet0_stage3_conv10_fwd | Convolution | [32,416,14,14] | 39986.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 95.33 | 669548544 | 9970005.33 | 3759050.67 | 8.20 | 48.77 | 7023.26 | false | 0.081590;0.081448;0.081485;0.081600;0.081464 | 669548544;669548544;669548544;669548544;669548544 | 9874496;10117152;9987616;10028096;9894304 | 3759040;3756352;3749280;3761760;3764736 | |
175 | densenet0_stage3_conv10_fwd | Convolution | [32,416,14,14] | 39986.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058082;0.058103;0.057948;0.058077;0.057919 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2432;2432 | |
176 | densenet0_stage3_batchnorm11_fwd | BatchNorm | [32,128,14,14] | 490.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 787434.67 | 3241184.00 | 41.20 | 1.26 | 423.25 | true | 0.411462;0.413303;0.404286;0.411564;0.413316 | 5079040;5079040;5079040;5079040;5079040 | 3244960;3231648;3253120;3243808;3234784 | 799904;791680;787904;763936;782720 | |
177 | densenet0_stage3_relu11_fwd | Activation | [32,128,14,14] | 268 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 17440.00 | 1611584.00 | 77.80 | 0.99 | 229.38 | true | 0.777299;0.779778;0.775952;0.778152;0.778754 | 1605632;1605632;1605632;1605632;1605632 | 17696;17568;17440;17056;17312 | 1638272;1610656;1614240;1609856;1602336 | |
178 | densenet0_stage3_conv11_fwd | Convolution | [32,128,14,14] | 39698.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 0.00 | 1125504.00 | 12.50 | 250.26 | 7824.27 | false | 0.124831;0.124825;0.124828;0.124826;0.124836 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;1792;0 | 1120032;1128224;1122432;1127456;1126624 | |
178 | densenet0_stage3_conv11_fwd | Convolution | [32,128,14,14] | 39698.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 165600.00 | 6.20 | 0.76 | 50.90 | true | 0.062358;0.062359;0.062357;0.062360;0.062363 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 168544;168352;150304;165184;163264 | |
179 | densenet0_stage3_concat5 | Concat | [32,416,14,14] | 661.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 22.00 | 0 | 5620058.67 | 5894576.00 | 73.70 | 0.00 | 0.00 | true | 0.844592;0.619878;0.841095;0.621201;0.847996;0.616162;0.840558;0.632776;0.843707;0.640141 | 0;0;0;0;0;0;0;0;0;0 | 10436672;802976;10436672;803040;10436672;804288;10436672;802976;10436672;803008 | 10887712;901984;10877952;914880;10879776;902016;10872800;907424;10881344;914624 | |
179 | densenet0_stage3_concat5 | Concat | [32,416,14,14] | 661.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 18.00 | 0 | 5620058.67 | 5894576.00 | 73.70 | 0.00 | 0.00 | true | 0.844592;0.619878;0.841095;0.621201;0.847996;0.616162;0.840558;0.632776;0.843707;0.640141 | 0;0;0;0;0;0;0;0;0;0 | 10436672;802976;10436672;803040;10436672;804288;10436672;802976;10436672;803008 | 10887712;901984;10877952;914880;10879776;902016;10872800;907424;10881344;914624 | |
180 | densenet0_stage3_batchnorm12_fwd | BatchNorm | [32,448,14,14] | 2271.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 35.00 | 17776640 | 11095861.33 | 12117184.00 | 80.10 | 0.77 | 507.90 | true | 0.799100;0.801357;0.802493;0.802041;0.800565 | 17776640;17776640;17776640;17776640;17776640 | 11094944;11096928;11095712;11092192;11098144 | 12119584;12112224;12119744;12121120;12106752 | |
181 | densenet0_stage3_relu12_fwd | Activation | [32,448,14,14] | 789 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 32.67 | 5619712 | 11094250.67 | 11230634.67 | 92.70 | 0.25 | 172.03 | true | 0.928787;0.925429;0.926467;0.928220;0.927377 | 5619712;5619712;5619712;5619712;5619712 | 11090048;11098720;11093984;11086688;11098720 | 11230432;11232256;11230592;11230880;11226848 | |
182 | densenet0_stage3_conv12_fwd | Convolution | [32,448,14,14] | 42220 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 100.67 | 720928768 | 11594794.67 | 3986538.67 | 8.20 | 46.27 | 7161.52 | false | 0.081763;0.081712;0.081840;0.081833;0.081728 | 720928768;720928768;720928768;720928768;720928768 | 3951904;3953152;4007136;4020576;3999328 | 11557536;11447328;11600768;11626080;11698368 | |
182 | densenet0_stage3_conv12_fwd | Convolution | [32,448,14,14] | 42220 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058094;0.058075;0.057951;0.058084;0.057916 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;4992;2432 | |
183 | densenet0_stage3_batchnorm13_fwd | BatchNorm | [32,128,14,14] | 585.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 904800.00 | 3002666.67 | 42.40 | 1.30 | 423.25 | true | 0.422398;0.429103;0.413877;0.419889;0.429967 | 5079040;5079040;5079040;5079040;5079040 | 3031520;3048192;2986368;2957696;2990112 | 890272;901728;908704;918336;903968 | |
184 | densenet0_stage3_relu13_fwd | Activation | [32,128,14,14] | 267 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 18336.00 | 1607584.00 | 77.00 | 0.99 | 240.83 | true | 0.772729;0.770103;0.767693;0.768662;0.770891 | 1605632;1605632;1605632;1605632;1605632 | 17184;19104;16672;18720;19360 | 1599488;1616224;1608640;1614624;1577632 | |
185 | densenet0_stage3_conv13_fwd | Convolution | [32,128,14,14] | 39669.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 0.00 | 1029514.67 | 12.50 | 273.60 | 7824.27 | false | 0.124833;0.124833;0.124832;0.124833;0.124833 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1000096;1028768;1029056;1031008;1030720 | |
185 | densenet0_stage3_conv13_fwd | Convolution | [32,128,14,14] | 39669.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 252629.33 | 6.20 | 0.59 | 47.51 | true | 0.062358;0.062358;0.062365;0.062369;0.062359 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 251968;254784;249920;251136;258560 | |
186 | densenet0_stage3_concat6 | Concat | [32,448,14,14] | 898.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 23.57 | 0 | 6021632.00 | 6311120.00 | 73.20 | 0.00 | 0.00 | true | 0.844643;0.621305;0.843226;0.616512;0.844119;0.619248;0.843486;0.621791;0.845570;0.616633 | 0;0;0;0;0;0;0;0;0;0 | 11239488;803008;11239488;803040;11239488;805280;11239488;802976;11239488;802976 | 11727392;897824;11725696;897376;11720128;902464;11722976;899680;11723648;897184 | |
186 | densenet0_stage3_concat6 | Concat | [32,448,14,14] | 898.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.14 | 0 | 6021632.00 | 6311120.00 | 73.20 | 0.00 | 0.00 | true | 0.844643;0.621305;0.843226;0.616512;0.844119;0.619248;0.843486;0.621791;0.845570;0.616633 | 0;0;0;0;0;0;0;0;0;0 | 11239488;803008;11239488;803040;11239488;805280;11239488;802976;11239488;802976 | 11727392;897824;11725696;897376;11720128;902464;11722976;899680;11723648;897184 | |
187 | densenet0_stage3_batchnorm14_fwd | BatchNorm | [32,480,14,14] | 1709.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.33 | 19046400 | 11916522.67 | 13005162.67 | 80.10 | 0.76 | 524.22 | true | 0.800600;0.796699;0.800853;0.802741;0.800604 | 19046400;19046400;19046400;19046400;19046400 | 11916672;11916736;11916160;11915136;11917504 | 12997088;13017728;13005344;13002912;13007232 | |
188 | densenet0_stage3_relu14_fwd | Activation | [32,480,14,14] | 835 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 34.33 | 6021120 | 12012725.33 | 12027637.33 | 93.60 | 0.25 | 175.37 | true | 0.936615;0.935230;0.935854;0.935639;0.935201 | 6021120;6021120;6021120;6021120;6021120 | 12008160;12000992;12016736;12013280;12017120 | 12030976;12015424;12024384;12027552;12039520 | |
189 | densenet0_stage3_conv14_fwd | Convolution | [32,480,14,14] | 43966.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 106.00 | 772308992 | 12990410.67 | 3870848.00 | 8.20 | 45.80 | 7285.93 | false | 0.081998;0.081988;0.081985;0.081929;0.081941 | 772308992;772308992;772308992;772308992;772308992 | 3835584;3911872;3878528;3898432;3810208 | 13035552;13057664;12925344;13010336;12858912 | |
189 | densenet0_stage3_conv14_fwd | Convolution | [32,480,14,14] | 43966.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058126;0.058036;0.057914;0.058082;0.057951 | 0;0;0;0;0 | 3072;2560;2560;2432;2432 | 352;96;96;96;96 | |
190 | densenet0_stage3_batchnorm15_fwd | BatchNorm | [32,128,14,14] | 605 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 928512.00 | 3115712.00 | 42.50 | 1.26 | 423.25 | true | 0.420304;0.429846;0.426839;0.418181;0.428535 | 5079040;5079040;5079040;5079040;5079040 | 908256;934368;949664;942912;902720 | 3153888;3065152;3114656;3078592;3171840 | |
191 | densenet0_stage3_relu15_fwd | Activation | [32,128,14,14] | 264.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 18080.00 | 1611626.67 | 77.80 | 0.99 | 229.38 | true | 0.782246;0.776318;0.778412;0.778034;0.778350 | 1605632;1605632;1605632;1605632;1605632 | 17952;19360;16928;16416;23200 | 1601056;1607104;1616480;1615232;1612544 | |
192 | densenet0_stage3_conv15_fwd | Convolution | [32,128,14,14] | 39493.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1118549.33 | 12.50 | 251.82 | 7897.32 | false | 0.124827;0.124833;0.124823;0.124822;0.124829 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1116320;1119744;1117792;1118112;1123264 | |
192 | densenet0_stage3_conv15_fwd | Convolution | [32,128,14,14] | 39493.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 160757.33 | 6.20 | 0.77 | 50.90 | true | 0.062357;0.062357;0.062357;0.062356;0.062360 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 174976;162432;159136;160704;154592 | |
193 | densenet0_stage3_concat7 | Concat | [32,480,14,14] | 837.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 24.43 | 0 | 6422677.33 | 6734496.00 | 73.60 | 0.00 | 0.00 | true | 0.847628;0.624782;0.846383;0.616118;0.847307;0.620565;0.847330;0.622959;0.849233;0.628760 | 0;0;0;0;0;0;0;0;0;0 | 12042304;803072;12043328;803040;12042304;803040;12042304;803040;12042304;803008 | 12551904;914016;12557184;909248;12552192;917056;12551904;917984;12555328;915936 | |
193 | densenet0_stage3_concat7 | Concat | [32,480,14,14] | 837.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.71 | 0 | 6422677.33 | 6734496.00 | 73.60 | 0.00 | 0.00 | true | 0.847628;0.624782;0.846383;0.616118;0.847307;0.620565;0.847330;0.622959;0.849233;0.628760 | 0;0;0;0;0;0;0;0;0;0 | 12042304;803072;12043328;803040;12042304;803040;12042304;803040;12042304;803008 | 12551904;914016;12557184;909248;12552192;917056;12551904;917984;12555328;915936 | |
194 | densenet0_stage3_batchnorm16_fwd | BatchNorm | [32,512,14,14] | 2040 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 39.00 | 20316160 | 12737120.00 | 13862154.67 | 81.20 | 0.76 | 520.93 | true | 0.811257;0.811818;0.811051;0.813223;0.812745 | 20316160;20316160;20316160;20316160;20316160 | 12738368;12732256;12736928;12736064;12741024 | 13864992;13865312;13862880;13858592;13856448 | |
195 | densenet0_stage3_relu16_fwd | Activation | [32,512,14,14] | 915.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.33 | 6422528 | 12841109.33 | 12831392.00 | 94.00 | 0.25 | 176.77 | true | 0.938291;0.941111;0.941622;0.942962;0.937331 | 6422528;6422528;6422528;6422528;6422528 | 12842080;12841312;12841856;12840160;12839936 | 12830048;12829344;12847520;12832160;12831968 | |
196 | densenet0_stage3_conv16_fwd | Convolution | [32,512,14,14] | 48837 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 112.33 | 823689216 | 14642186.67 | 3976778.67 | 8.20 | 44.24 | 7332.57 | false | 0.082165;0.082211;0.082160;0.082191;0.082216 | 823689216;823689216;823689216;823689216;823689216 | 4009632;3951616;3923136;3969088;4014208 | 14617568;14683328;14559104;14662240;14646752 | |
196 | densenet0_stage3_conv16_fwd | Convolution | [32,512,14,14] | 48837 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058073;0.058056;0.057914;0.058084;0.057958 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2560;2432;2560 | |
197 | densenet0_stage3_batchnorm17_fwd | BatchNorm | [32,128,14,14] | 687.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 1035242.67 | 2996736.00 | 44.40 | 1.26 | 423.25 | true | 0.430598;0.455029;0.453940;0.432554;0.444995 | 5079040;5079040;5079040;5079040;5079040 | 1034752;1023968;1026496;1044480;1052800 | 2959968;3030560;3044096;2999680;2959680 | |
198 | densenet0_stage3_relu17_fwd | Activation | [32,128,14,14] | 268.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 17056.00 | 1602528.00 | 76.80 | 0.99 | 229.38 | true | 0.765932;0.770541;0.762928;0.771298;0.768997 | 1605632;1605632;1605632;1605632;1605632 | 17568;16672;18848;16928;16544 | 1601408;1625024;1599424;1606752;1577056 | |
199 | densenet0_stage3_conv17_fwd | Convolution | [32,128,14,14] | 39720.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.33 | 281673728 | 0.00 | 1122112.00 | 12.50 | 251.02 | 7971.97 | false | 0.124819;0.124831;0.124825;0.124837;0.124832 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1123840;1126848;1121952;1118304;1120544 | |
199 | densenet0_stage3_conv17_fwd | Convolution | [32,128,14,14] | 39720.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 169440.00 | 6.20 | 0.75 | 47.51 | true | 0.062351;0.062356;0.062361;0.062356;0.062352 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 169632;162656;169824;168960;169728 | |
200 | densenet0_stage3_concat8 | Concat | [32,512,14,14] | 1062.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 26.00 | 0 | 6824069.33 | 7152074.67 | 73.40 | 0.00 | 0.00 | true | 0.849084;0.615906;0.846102;0.612320;0.846607;0.617359;0.847499;0.628839;0.845439;0.621844 | 0;0;0;0;0;0;0;0;0;0 | 12845120;803008;12845120;802976;12845120;803040;12845120;802976;12845120;803008 | 13400352;900864;13405792;892608;13402208;891968;13400256;905536;13404256;903232 | |
200 | densenet0_stage3_concat8 | Concat | [32,512,14,14] | 1062.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 21.00 | 0 | 6824069.33 | 7152074.67 | 73.40 | 0.00 | 0.00 | true | 0.849084;0.615906;0.846102;0.612320;0.846607;0.617359;0.847499;0.628839;0.845439;0.621844 | 0;0;0;0;0;0;0;0;0;0 | 12845120;803008;12845120;802976;12845120;803040;12845120;802976;12845120;803008 | 13400352;900864;13405792;892608;13402208;891968;13400256;905536;13404256;903232 | |
201 | densenet0_stage3_batchnorm18_fwd | BatchNorm | [32,544,14,14] | 2021.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 41.67 | 21585920 | 13554666.67 | 14746581.33 | 81.60 | 0.76 | 518.06 | true | 0.814665;0.815594;0.815869;0.817532;0.819158 | 21585920;21585920;21585920;21585920;21585920 | 13552672;13558464;13554560;13556768;13552576 | 14732896;14754240;14760704;14745472;14740032 | |
202 | densenet0_stage3_relu18_fwd | Activation | [32,544,14,14] | 939.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 38.33 | 6823936 | 13647989.33 | 13629824.00 | 93.90 | 0.25 | 178.02 | true | 0.943464;0.935802;0.938068;0.939164;0.941046 | 6823936;6823936;6823936;6823936;6823936 | 13647840;13650912;13648032;13647584;13648096 | 13628608;13634496;13628160;13623616;13632704 | |
203 | densenet0_stage3_conv18_fwd | Convolution | [32,544,14,14] | 50865 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 119.00 | 875069440 | 16513845.33 | 4185760.00 | 8.20 | 42.27 | 7353.52 | false | 0.082137;0.082196;0.082065;0.082098;0.082123 | 875069440;875069440;875069440;875069440;875069440 | 16484352;16568672;16588032;16488512;16344192 | 4174240;4186720;4154976;4196320;4252672 | |
203 | densenet0_stage3_conv18_fwd | Convolution | [32,544,14,14] | 50865 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058104;0.058036;0.057958;0.058048;0.057900 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2432;2560 | |
204 | densenet0_stage3_batchnorm19_fwd | BatchNorm | [32,128,14,14] | 640 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 1179573.33 | 2774784.00 | 44.10 | 1.28 | 423.25 | true | 0.439557;0.438223;0.439128;0.443336;0.449723 | 5079040;5079040;5079040;5079040;5079040 | 1189600;1194816;1150112;1172960;1176160 | 2783552;2770592;2813472;2770208;2711616 | |
205 | densenet0_stage3_relu19_fwd | Activation | [32,128,14,14] | 257.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1605632 | 16586.67 | 1625834.67 | 77.90 | 0.98 | 267.61 | true | 0.776096;0.777309;0.779293;0.780576;0.780475 | 1605632;1605632;1605632;1605632;1605632 | 1646592;1630656;1618496;1606528;1628352 | 15520;15392;19360;16672;17568 | |
206 | densenet0_stage3_conv19_fwd | Convolution | [32,128,14,14] | 39598.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 281673728 | 0.00 | 1120842.67 | 12.50 | 251.31 | 8047.82 | false | 0.124825;0.124838;0.124836;0.124834;0.124833 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;6656 | 1119552;1110272;1128128;1119712;1123264 | |
206 | densenet0_stage3_conv19_fwd | Convolution | [32,128,14,14] | 39598.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 164672.00 | 6.20 | 0.76 | 47.51 | true | 0.062358;0.062364;0.062356;0.062355;0.062363 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147904 | 167232;165120;161664;170080;160928 | |
207 | densenet0_stage3_concat9 | Concat | [32,544,14,14] | 935.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 28.14 | 0 | 7225610.67 | 7581760.00 | 73.80 | 0.00 | 0.00 | true | 0.847980;0.624643;0.849391;0.616337;0.848811;0.606515;0.848216;0.627392;0.850053;0.633911 | 0;0;0;0;0;0;0;0;0;0 | 13648192;803040;13647936;803392;13647936;803040;13647936;803328;13647936;803136 | 14215008;947168;14219808;956672;14215072;936992;14204768;948160;14220192;950880 | |
207 | densenet0_stage3_concat9 | Concat | [32,544,14,14] | 935.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 22.71 | 0 | 7225610.67 | 7581760.00 | 73.80 | 0.00 | 0.00 | true | 0.847980;0.624643;0.849391;0.616337;0.848811;0.606515;0.848216;0.627392;0.850053;0.633911 | 0;0;0;0;0;0;0;0;0;0 | 14215008;947168;14219808;956672;14215072;936992;14204768;948160;14220192;950880 | 13648192;803040;13647936;803392;13647936;803040;13647936;803328;13647936;803136 | |
208 | densenet0_stage3_batchnorm20_fwd | BatchNorm | [32,576,14,14] | 3262.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 44.33 | 22855680 | 14396885.33 | 15594762.67 | 82.10 | 0.76 | 515.55 | true | 0.822657;0.820576;0.824373;0.820255;0.820527 | 22855680;22855680;22855680;22855680;22855680 | 14394624;14396448;14401504;14395360;14398848 | 15596064;15586368;15607296;15601856;15583648 | |
209 | densenet0_stage3_relu20_fwd | Activation | [32,576,14,14] | 1007.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 40.67 | 7225344 | 14451040.00 | 14431029.33 | 93.50 | 0.25 | 177.67 | true | 0.930618;0.934253;0.934122;0.936491;0.936354 | 7225344;7225344;7225344;7225344;7225344 | 14451040;14451040;14450912;14451040;14457312 | 14429024;14423104;14435360;14428704;14446496 | |
210 | densenet0_stage3_conv20_fwd | Convolution | [32,576,14,14] | 53612.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 125.33 | 926449664 | 18352106.67 | 4233589.33 | 8.20 | 41.02 | 7391.91 | false | 0.081832;0.081936;0.081895;0.081959;0.081991 | 926449664;926449664;926449664;926449664;926449664 | 4228160;4240384;4253600;4232224;4226176 | 18339232;18419520;18315072;18202848;18402016 | |
210 | densenet0_stage3_conv20_fwd | Convolution | [32,576,14,14] | 53612.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058120;0.058053;0.057953;0.058114;0.057931 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
211 | densenet0_stage3_batchnorm21_fwd | BatchNorm | [32,128,14,14] | 679.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 1186208.00 | 2728789.33 | 43.30 | 1.30 | 423.25 | true | 0.432437;0.428668;0.428830;0.446209;0.436833 | 5079040;5079040;5079040;5079040;5079040 | 1200416;1206112;1183360;1174848;1163680 | 2729120;2727072;2709408;2734560;2730176 | |
212 | densenet0_stage3_relu21_fwd | Activation | [32,128,14,14] | 262.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 15690.67 | 1589920.00 | 76.90 | 1.00 | 229.38 | true | 0.768950;0.770346;0.769798;0.768268;0.767606 | 1605632;1605632;1605632;1605632;1605632 | 16544;17824;14368;15520;15008 | 1592096;1586464;1591200;1609440;1580768 | |
213 | densenet0_stage3_conv21_fwd | Convolution | [32,128,14,14] | 39548 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1038112.00 | 12.50 | 271.33 | 7897.32 | false | 0.124820;0.124831;0.124831;0.124826;0.124831 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1036448;1035520;1039136;1038752;1039200 | |
213 | densenet0_stage3_conv21_fwd | Convolution | [32,128,14,14] | 39548 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 244149.33 | 6.20 | 0.61 | 47.51 | true | 0.062361;0.062354;0.062365;0.062367;0.062359 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 249952;245440;244672;242336;235744 | |
214 | densenet0_stage3_concat10 | Concat | [32,576,14,14] | 1210.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 29.00 | 0 | 7626885.33 | 8009024.00 | 73.70 | 0.00 | 0.00 | true | 0.851246;0.620770;0.853088;0.617485;0.850228;0.626615;0.849160;0.626203;0.849465;0.617880 | 0;0;0;0;0;0;0;0;0;0 | 14455360;802976;14450752;803040;14450752;802976;14450752;803040;14450752;802976 | 15107872;915008;15106528;913920;15087424;930816;15095648;928416;15095264;916576 | |
214 | densenet0_stage3_concat10 | Concat | [32,576,14,14] | 1210.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 23.29 | 0 | 7626885.33 | 8009024.00 | 73.70 | 0.00 | 0.00 | true | 0.851246;0.620770;0.853088;0.617485;0.850228;0.626615;0.849160;0.626203;0.849465;0.617880 | 0;0;0;0;0;0;0;0;0;0 | 14455360;802976;14450752;803040;14450752;802976;14450752;803040;14450752;802976 | 15107872;915008;15106528;913920;15087424;930816;15095648;928416;15095264;916576 | |
215 | densenet0_stage3_batchnorm22_fwd | BatchNorm | [32,608,14,14] | 2346.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 46.00 | 24125440 | 15226186.67 | 16457184.00 | 82.60 | 0.76 | 524.47 | true | 0.826456;0.828559;0.824330;0.823470;0.827114 | 24125440;24125440;24125440;24125440;24125440 | 16464800;16453824;16449760;16452928;16472800 | 15233856;15223168;15226528;15228864;15220832 | |
216 | densenet0_stage3_relu22_fwd | Activation | [32,608,14,14] | 1050.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 43.67 | 7626752 | 15253866.67 | 15240437.33 | 93.80 | 0.25 | 174.66 | true | 0.940304;0.936092;0.936183;0.940591;0.933515 | 7626752;7626752;7626752;7626752;7626752 | 15253888;15253856;15254112;15253856;15253856 | 15240928;15239680;15242560;15240704;15234304 | |
217 | densenet0_stage3_conv22_fwd | Convolution | [32,608,14,14] | 58554.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 133.00 | 977829888 | 19630826.67 | 4461813.33 | 8.20 | 40.59 | 7352.10 | false | 0.081975;0.081907;0.081962;0.082100;0.081933 | 977829888;977829888;977829888;977829888;977829888 | 19534592;19605984;19751904;19498976;19897952 | 4477120;4456448;4451872;4406496;4483424 | |
217 | densenet0_stage3_conv22_fwd | Convolution | [32,608,14,14] | 58554.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058111;0.058072;0.057940;0.058073;0.057916 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2432;2432 | |
218 | densenet0_stage3_batchnorm23_fwd | BatchNorm | [32,128,14,14] | 745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.33 | 5079040 | 1334613.33 | 2502037.33 | 43.60 | 1.32 | 411.83 | true | 0.434601;0.435926;0.439605;0.434788;0.438177 | 5079040;5079040;5079040;5079040;5079040 | 1340448;1337760;1322048;1325632;1348768 | 2480672;2512032;2513408;2558336;2476256 | |
219 | densenet0_stage3_relu23_fwd | Activation | [32,128,14,14] | 259.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 15477.33 | 1622357.33 | 77.70 | 0.98 | 229.38 | true | 0.776564;0.778811;0.774890;0.779508;0.774109 | 1605632;1605632;1605632;1605632;1605632 | 13984;16544;18848;15776;14112 | 1626400;1619872;1608320;1627328;1620800 | |
220 | densenet0_stage3_conv23_fwd | Convolution | [32,128,14,14] | 39566.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 0.00 | 1025653.33 | 12.50 | 274.63 | 7824.27 | false | 0.124830;0.124827;0.124824;0.124824;0.124827 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1023648;1027072;1024608;1025280;1029184 | |
220 | densenet0_stage3_conv23_fwd | Convolution | [32,128,14,14] | 39566.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147648.00 | 245504.00 | 6.20 | 0.60 | 54.83 | true | 0.062355;0.062356;0.062355;0.062351;0.062356 | 237568;237568;237568;237568;237568 | 147648;147648;147904;147648;147648 | 237792;253536;245024;238272;253216 | |
221 | densenet0_stage3_concat11 | Concat | [32,608,14,14] | 1117.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 30.86 | 0 | 8028272.00 | 8394416.00 | 74.30 | 0.00 | 0.00 | true | 0.852983;0.627562;0.851438;0.627848;0.851745;0.622930;0.852455;0.639552;0.851356;0.633485 | 0;0;0;0;0;0;0;0;0;0 | 15253568;802976;15253568;802976;15253568;802976;15253568;802976;15253568;802976 | 15894176;895616;15893728;888160;15893952;889632;15894816;894784;15897152;894240 | |
221 | densenet0_stage3_concat11 | Concat | [32,608,14,14] | 1117.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 24.86 | 0 | 8028272.00 | 8394416.00 | 74.30 | 0.00 | 0.00 | true | 0.852983;0.627562;0.851438;0.627848;0.851745;0.622930;0.852455;0.639552;0.851356;0.633485 | 0;0;0;0;0;0;0;0;0;0 | 15253568;802976;15253568;802976;15253568;802976;15253568;802976;15253568;802976 | 15894176;895616;15893728;888160;15893952;889632;15894816;894784;15897152;894240 | |
222 | densenet0_stage3_batchnorm24_fwd | BatchNorm | [32,640,14,14] | 3398.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 48.33 | 25395200 | 16027349.33 | 17378528.00 | 82.50 | 0.76 | 525.42 | true | 0.825567;0.823069;0.824155;0.825123;0.824824 | 25395200;25395200;25395200;25395200;25395200 | 16027744;16026400;16032576;16027904;16025632 | 17374304;17385920;17380544;17372992;17380736 | |
223 | densenet0_stage3_relu24_fwd | Activation | [32,640,14,14] | 1107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 45.00 | 8028160 | 16056682.67 | 16047072.00 | 94.50 | 0.25 | 178.40 | true | 0.945180;0.944638;0.944249;0.947523;0.943813 | 8028160;8028160;8028160;8028160;8028160 | 16056704;16056672;16056672;16056672;16061792 | 16049376;16035904;16050016;16046304;16045536 | |
224 | densenet0_stage3_conv24_fwd | Convolution | [32,640,14,14] | 60663.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 139.67 | 1029210112 | 21318560.00 | 4479456.00 | 8.20 | 39.89 | 7369.03 | false | 0.082004;0.081998;0.081736;0.081889;0.081806 | 1029210112;1029210112;1029210112;1029210112;1029210112 | 21250400;21289792;21324768;21435808;21341120 | 4484032;4498368;4479296;4471136;4475040 | |
224 | densenet0_stage3_conv24_fwd | Convolution | [32,640,14,14] | 60663.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058100;0.058049;0.057917;0.058048;0.057937 | 0;0;0;0;0 | 96;96;96;96;1376 | 2560;2432;2432;2432;4608 | |
225 | densenet0_stage3_batchnorm25_fwd | BatchNorm | [32,128,14,14] | 701 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.33 | 5079040 | 1299434.67 | 2481952.00 | 43.50 | 1.34 | 411.83 | true | 0.436126;0.433402;0.440204;0.435380;0.430590 | 5079040;5079040;5079040;5079040;5079040 | 2488704;2477984;2479168;2489984;2476704 | 1301056;1299456;1290176;1298368;1300480 | |
226 | densenet0_stage3_relu25_fwd | Activation | [32,128,14,14] | 282.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 15008.00 | 1576800.00 | 77.10 | 1.01 | 229.38 | true | 0.772500;0.767073;0.771016;0.774739;0.770436 | 1605632;1605632;1605632;1605632;1605632 | 13856;19360;15648;15136;14240 | 1569952;1599616;1574528;1569152;1585920 | |
227 | densenet0_stage3_conv25_fwd | Convolution | [32,128,14,14] | 39621 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 2474.67 | 1079349.33 | 12.50 | 260.37 | 7824.27 | false | 0.124828;0.124826;0.124828;0.124822;0.124834 | 281673728;281673728;281673728;281673728;281673728 | 1081344;1082336;1069280;1086048;1074368 | 0;8192;0;0;7424 | |
227 | densenet0_stage3_conv25_fwd | Convolution | [32,128,14,14] | 39621 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 203008.00 | 6.20 | 0.68 | 47.51 | true | 0.062354;0.062366;0.062356;0.062358;0.062356 | 237568;237568;237568;237568;237568 | 196128;197824;212896;203872;207328 | 147648;147648;147648;147648;147648 | |
228 | densenet0_stage3_concat12 | Concat | [32,640,14,14] | 1395.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 32.14 | 0 | 8429744.00 | 8829680.00 | 74.20 | 0.00 | 0.00 | true | 0.851099;0.626535;0.851695;0.628730;0.852194;0.634977;0.852052;0.620950;0.853317;0.635766 | 0;0;0;0;0;0;0;0;0;0 | 16056384;803072;16056384;803104;16056384;803040;16056384;803104;16056384;803104 | 16755104;913568;16757216;901568;16745824;903904;16756768;902912;16762176;899872 | |
228 | densenet0_stage3_concat12 | Concat | [32,640,14,14] | 1395.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 25.86 | 0 | 8429744.00 | 8829680.00 | 74.20 | 0.00 | 0.00 | true | 0.851099;0.626535;0.851695;0.628730;0.852194;0.634977;0.852052;0.620950;0.853317;0.635766 | 0;0;0;0;0;0;0;0;0;0 | 16056384;803072;16056384;803104;16056384;803040;16056384;803104;16056384;803104 | 16755104;913568;16757216;901568;16745824;903904;16756768;902912;16762176;899872 | |
229 | densenet0_stage3_batchnorm26_fwd | BatchNorm | [32,672,14,14] | 2755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 50.33 | 26664960 | 16845397.33 | 18237408.00 | 83.50 | 0.76 | 529.77 | true | 0.836117;0.835337;0.833412;0.833216;0.834822 | 26664960;26664960;26664960;26664960;26664960 | 16847072;16846368;16844512;16845312;16844000 | 18226752;18244096;18234784;18235488;18241952 | |
230 | densenet0_stage3_relu26_fwd | Activation | [32,672,14,14] | 1432.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 47.33 | 8429568 | 16859498.67 | 16835658.67 | 94.40 | 0.25 | 178.09 | true | 0.941644;0.948447;0.941799;0.944031;0.946225 | 8429568;8429568;8429568;8429568;8429568 | 16840224;16828736;16839808;16836416;16830752 | 16859488;16859488;16859520;16859488;16859552 | |
231 | densenet0_stage3_conv26_fwd | Convolution | [32,672,14,14] | 62530.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 145.67 | 1080590336 | 22430261.33 | 4652789.33 | 8.20 | 39.90 | 7418.22 | false | 0.081533;0.081922;0.081759;0.081770;0.081922 | 1080590336;1080590336;1080590336;1080590336;1080590336 | 22390080;22430560;22461536;22462304;22398688 | 4617280;4661344;4656928;4654048;4647392 | |
231 | densenet0_stage3_conv26_fwd | Convolution | [32,672,14,14] | 62530.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058676;0.058118;0.057940;0.058043;0.057937 | 0;0;0;0;0 | 96;96;96;96;96 | 2816;2432;2432;2432;2432 | |
232 | densenet0_stage3_batchnorm27_fwd | BatchNorm | [32,128,14,14] | 727.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.33 | 5079040 | 1392277.33 | 2310602.67 | 43.90 | 1.37 | 411.83 | true | 0.444370;0.433826;0.439630;0.440189;0.436468 | 5079040;5079040;5079040;5079040;5079040 | 1394400;1393120;1400192;1381760;1389312 | 2343168;2294912;2299808;2314240;2317760 | |
233 | densenet0_stage3_relu27_fwd | Activation | [32,128,14,14] | 316.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 14282.67 | 1614016.00 | 77.80 | 0.99 | 240.83 | true | 0.777714;0.774909;0.786309;0.780092;0.774759 | 1605632;1605632;1605632;1605632;1605632 | 12832;15776;12704;14880;15136 | 1613888;1605280;1599072;1622880;1627424 | |
234 | densenet0_stage3_conv27_fwd | Convolution | [32,128,14,14] | 39508 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1071210.67 | 12.50 | 262.95 | 7897.32 | false | 0.124823;0.124833;0.124830;0.124828;0.124831 | 281673728;281673728;281673728;281673728;281673728 | 0;0;1280;0;0 | 1072544;1069536;1059744;1072768;1071552 | |
234 | densenet0_stage3_conv27_fwd | Convolution | [32,128,14,14] | 39508 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147648.00 | 203872.00 | 6.20 | 0.68 | 54.83 | true | 0.062352;0.062358;0.062367;0.062353;0.062367 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 208928;210912;198592;198912;203776 | |
235 | densenet0_stage3_concat13 | Concat | [32,672,14,14] | 1228 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 33.00 | 0 | 8831157.33 | 9249925.33 | 73.90 | 0.00 | 0.00 | true | 0.853157;0.617414;0.851790;0.615969;0.853067;0.628263;0.853345;0.619042;0.853028;0.631174 | 0;0;0;0;0;0;0;0;0;0 | 17591136;906336;17586944;912096;17584736;918400;17591040;902336;17592320;901120 | 16859200;803072;16859200;803136;16859200;803008;16859200;803072;16859200;803136 | |
235 | densenet0_stage3_concat13 | Concat | [32,672,14,14] | 1228 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 26.29 | 0 | 8831157.33 | 9249925.33 | 73.90 | 0.00 | 0.00 | true | 0.853157;0.617414;0.851790;0.615969;0.853067;0.628263;0.853345;0.619042;0.853028;0.631174 | 0;0;0;0;0;0;0;0;0;0 | 16859200;803072;16859200;803136;16859200;803008;16859200;803072;16859200;803136 | 17591136;906336;17586944;912096;17584736;918400;17591040;902336;17592320;901120 | |
236 | densenet0_stage3_batchnorm28_fwd | BatchNorm | [32,704,14,14] | 4020 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 53.00 | 27934720 | 17645237.33 | 19097717.33 | 83.60 | 0.76 | 527.07 | true | 0.837550;0.834183;0.836253;0.834804;0.835501 | 27934720;27934720;27934720;27934720;27934720 | 17643968;17643072;17645920;17645824;17649024 | 19101728;19102656;19088768;19088512;19114176 | |
237 | densenet0_stage3_relu28_fwd | Activation | [32,704,14,14] | 1493 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 50.00 | 8830976 | 17662304.00 | 17647904.00 | 94.10 | 0.25 | 176.62 | true | 0.942542;0.940302;0.938939;0.941442;0.940787 | 8830976;8830976;8830976;8830976;8830976 | 17653312;17637024;17648320;17655360;17642080 | 17662304;17662304;17662304;17662304;17662304 | |
238 | densenet0_stage3_conv28_fwd | Convolution | [32,704,14,14] | 67217 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 152.33 | 1131970560 | 24568384.00 | 4626122.67 | 8.20 | 38.77 | 7430.90 | false | 0.081777;0.081655;0.081671;0.081765;0.081818 | 1131970560;1131970560;1131970560;1131970560;1131970560 | 24614656;24387008;24531968;24605408;24567776 | 4607808;4636224;4634336;4602368;4644768 | |
238 | densenet0_stage3_conv28_fwd | Convolution | [32,704,14,14] | 67217 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058181;0.058048;0.057944;0.058651;0.057902 | 0;0;0;0;0 | 2432;2432;2432;2432;2432 | 96;96;96;96;96 | |
239 | densenet0_stage3_batchnorm29_fwd | BatchNorm | [32,128,14,14] | 713.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 13.00 | 5079040 | 1392341.33 | 2338826.67 | 42.60 | 1.36 | 390.70 | true | 0.425717;0.421668;0.429962;0.423524;0.428950 | 5079040;5079040;5079040;5079040;5079040 | 1388512;1393696;1394816;1424704;1387808 | 2349984;2342208;2329280;2344992;2321920 | |
240 | densenet0_stage3_relu29_fwd | Activation | [32,128,14,14] | 322.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 14752.00 | 1612533.33 | 77.10 | 0.99 | 240.83 | true | 0.770138;0.769541;0.771545;0.774873;0.770723 | 1605632;1605632;1605632;1605632;1605632 | 15392;16288;13472;15392;13472 | 1602272;1615904;1592224;1631424;1619424 | |
241 | densenet0_stage3_conv29_fwd | Convolution | [32,128,14,14] | 39627.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1123445.33 | 12.50 | 250.72 | 7897.32 | false | 0.124830;0.124830;0.124827;0.124824;0.124829 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1108640;1121696;1123328;1125312;1127616 | |
241 | densenet0_stage3_conv29_fwd | Convolution | [32,128,14,14] | 39627.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147648.00 | 166848.00 | 6.20 | 0.76 | 54.83 | true | 0.062356;0.062352;0.062356;0.062367;0.062358 | 237568;237568;237568;237568;237568 | 179296;167552;164544;161056;168448 | 147648;147648;147648;147648;147648 | |
242 | densenet0_stage3_concat14 | Concat | [32,704,14,14] | 1798.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 34.86 | 0 | 9234133.33 | 9663034.67 | 74.30 | 0.00 | 0.00 | true | 0.853571;0.620687;0.853078;0.633300;0.853399;0.629955;0.853223;0.624634;0.853682;0.636767 | 0;0;0;0;0;0;0;0;0;0 | 17662016;803040;17662016;803008;17667136;803008;17667136;802976;17665088;809632 | 18433856;891424;18430016;897440;18433312;890176;18434496;896352;18429440;891648 | |
242 | densenet0_stage3_concat14 | Concat | [32,704,14,14] | 1798.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 27.86 | 0 | 9234133.33 | 9663034.67 | 74.30 | 0.00 | 0.00 | true | 0.853571;0.620687;0.853078;0.633300;0.853399;0.629955;0.853223;0.624634;0.853682;0.636767 | 0;0;0;0;0;0;0;0;0;0 | 17662016;803040;17662016;803008;17667136;803008;17667136;802976;17665088;809632 | 18433856;891424;18430016;897440;18433312;890176;18434496;896352;18429440;891648 | |
243 | densenet0_stage3_batchnorm30_fwd | BatchNorm | [32,736,14,14] | 2968.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 54.67 | 29204480 | 18468896.00 | 19969824.00 | 83.90 | 0.76 | 534.23 | true | 0.837850;0.841491;0.840258;0.837589;0.839051 | 29204480;29204480;29204480;29204480;29204480 | 18470816;18468064;18470272;18467712;18468352 | 19966240;19964288;19981568;19963424;19978944 | |
244 | densenet0_stage3_relu30_fwd | Activation | [32,736,14,14] | 1559.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 51.33 | 9232384 | 18465120.00 | 18456928.00 | 94.00 | 0.25 | 179.85 | true | 0.937709;0.942166;0.936662;0.940991;0.939925 | 9232384;9232384;9232384;9232384;9232384 | 18459232;18473472;18451328;18456224;18455328 | 18465120;18465120;18465120;18465120;18465120 | |
245 | densenet0_stage3_conv30_fwd | Convolution | [32,736,14,14] | 69434.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 158.67 | 1183350784 | 26029130.67 | 4879285.33 | 8.20 | 38.29 | 7458.08 | false | 0.081883;0.081961;0.082340;0.081963;0.082191 | 1183350784;1183350784;1183350784;1183350784;1183350784 | 4882688;4871488;4846816;4883680;4891424 | 25937088;26080000;25998400;26008992;26166048 | |
245 | densenet0_stage3_conv30_fwd | Convolution | [32,736,14,14] | 69434.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058078;0.058065;0.057946;0.058090;0.057944 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;4864;2432;2432 | |
246 | densenet0_stage3_batchnorm31_fwd | BatchNorm | [32,128,14,14] | 735 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.67 | 5079040 | 1539381.33 | 2070101.33 | 43.80 | 1.41 | 400.97 | true | 0.430018;0.431209;0.437097;0.450694;0.444998 | 5079040;5079040;5079040;5079040;5079040 | 1530976;1537856;1555168;1537728;1542560 | 2071008;2070400;2088800;2068896;2056640 | |
247 | densenet0_stage3_relu31_fwd | Activation | [32,128,14,14] | 323.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.33 | 1605632 | 14240.00 | 1604768.00 | 77.40 | 0.99 | 253.53 | true | 0.773700;0.772038;0.779718;0.775374;0.772435 | 1605632;1605632;1605632;1605632;1605632 | 15392;15648;14112;12320;13216 | 1603072;1607712;1603520;1577216;1624384 | |
248 | densenet0_stage3_conv31_fwd | Convolution | [32,128,14,14] | 39614.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 0.00 | 1123072.00 | 12.50 | 250.81 | 7824.27 | false | 0.124830;0.124827;0.124831;0.124826;0.124828 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1126752;1111680;1121888;1124864;1122464 | |
248 | densenet0_stage3_conv31_fwd | Convolution | [32,128,14,14] | 39614.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 167456.00 | 6.20 | 0.75 | 47.51 | true | 0.062357;0.062358;0.062367;0.062355;0.062358 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 163808;176384;171744;166816;163776 | |
249 | densenet0_stage3_concat15 | Concat | [32,736,14,14] | 1439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 36.00 | 0 | 9633930.67 | 10087109.33 | 73.80 | 0.00 | 0.00 | true | 0.856105;0.609542;0.854961;0.618995;0.855203;0.622279;0.856732;0.614643;0.857951;0.619108 | 0;0;0;0;0;0;0;0;0;0 | 18464832;802976;18464832;803008;18464832;803008;18464832;802976;18464832;803072 | 19272288;906720;19260448;913792;19263072;910816;19262112;909728;19261216;914272 | |
249 | densenet0_stage3_concat15 | Concat | [32,736,14,14] | 1439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 28.71 | 0 | 9633930.67 | 10087109.33 | 73.80 | 0.00 | 0.00 | true | 0.856105;0.609542;0.854961;0.618995;0.855203;0.622279;0.856732;0.614643;0.857951;0.619108 | 0;0;0;0;0;0;0;0;0;0 | 19272288;906720;19260448;913792;19263072;910816;19262112;909728;19261216;914272 | 18464832;802976;18464832;803008;18464832;803008;18464832;802976;18464832;803072 | |
250 | densenet0_stage3_batchnorm32_fwd | BatchNorm | [32,768,14,14] | 3324.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 57.00 | 30474240 | 19291765.33 | 20818837.33 | 84.40 | 0.76 | 534.64 | true | 0.845060;0.843395;0.843078;0.844134;0.842251 | 30474240;30474240;30474240;30474240;30474240 | 20835648;20818720;20819488;20818304;20815968 | 19286304;19293952;19294080;19291968;19289376 | |
251 | densenet0_stage3_relu32_fwd | Activation | [32,768,14,14] | 1630.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 53.67 | 9633792 | 19267936.00 | 19257408.00 | 94.70 | 0.25 | 179.51 | true | 0.941630;0.947726;0.946901;0.947402;0.945851 | 9633792;9633792;9633792;9633792;9633792 | 19267936;19267936;19267968;19267936;19267936 | 19252768;19257440;19257312;19263296;19257472 | |
252 | densenet0_stage3_conv32_fwd | Convolution | [32,768,14,14] | 71403 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 165.00 | 1234731008 | 27383978.67 | 4952618.67 | 8.20 | 38.18 | 7483.22 | false | 0.082034;0.081629;0.081536;0.081531;0.081598 | 1234731008;1234731008;1234731008;1234731008;1234731008 | 27439552;27485856;27302304;27410080;27187936 | 4961760;4961248;4945792;4942208;4950816 | |
252 | densenet0_stage3_conv32_fwd | Convolution | [32,768,14,14] | 71403 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058094;0.058089;0.057940;0.058072;0.057891 | 0;0;0;0;0 | 4960;96;96;96;96 | 11904;2432;2432;2432;2432 | |
253 | densenet0_stage3_batchnorm33_fwd | BatchNorm | [32,128,14,14] | 734.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.33 | 5079040 | 1576021.33 | 2001472.00 | 42.60 | 1.42 | 411.83 | true | 0.425650;0.422248;0.425035;0.427506;0.432094 | 5079040;5079040;5079040;5079040;5079040 | 1582752;1589664;1556928;1553952;1588384 | 1989760;2010976;2000352;2003840;2000224 | |
254 | densenet0_stage3_relu33_fwd | Activation | [32,128,14,14] | 317.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 13557.33 | 1612917.33 | 76.90 | 0.99 | 229.38 | true | 0.768451;0.771031;0.767221;0.766525;0.770241 | 1605632;1605632;1605632;1605632;1605632 | 12832;12960;14240;15520;13472 | 1612128;1613856;1573024;1612768;1623392 | |
255 | densenet0_stage3_conv33_fwd | Convolution | [32,128,14,14] | 39501.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 281673728 | 0.00 | 1036416.00 | 12.50 | 271.78 | 8047.82 | false | 0.124826;0.124838;0.124833;0.124827;0.124826 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1037536;1035360;1042784;1032960;1036352 | |
255 | densenet0_stage3_conv33_fwd | Convolution | [32,128,14,14] | 39501.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 244213.33 | 6.20 | 0.61 | 47.51 | true | 0.062360;0.062349;0.062352;0.062363;0.062358 | 237568;237568;237568;237568;237568 | 241728;245568;245344;245952;239296 | 147648;147648;147648;151744;147648 | |
256 | densenet0_stage3_concat16 | Concat | [32,768,14,14] | 1919.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 37.14 | 0 | 10035317.33 | 10516256.00 | 74.20 | 0.00 | 0.00 | true | 0.855092;0.610950;0.855085;0.622328;0.854698;0.617403;0.854717;0.623919;0.853872;0.644197 | 0;0;0;0;0;0;0;0;0;0 | 20116256;918336;20106496;931008;20102688;925632;20106560;925152;20114976;919136 | 19267648;802976;19267648;802976;19267648;803008;19267648;802976;19267648;802976 | |
256 | densenet0_stage3_concat16 | Concat | [32,768,14,14] | 1919.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 29.57 | 0 | 10035317.33 | 10516256.00 | 74.20 | 0.00 | 0.00 | true | 0.855092;0.610950;0.855085;0.622328;0.854698;0.617403;0.854717;0.623919;0.853872;0.644197 | 0;0;0;0;0;0;0;0;0;0 | 19267648;802976;19267648;802976;19267648;803008;19267648;802976;19267648;802976 | 20116256;918336;20106496;931008;20102688;925632;20106560;925152;20114976;919136 | |
257 | densenet0_stage3_batchnorm34_fwd | BatchNorm | [32,800,14,14] | 3403 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 58.67 | 31744000 | 20102378.67 | 21696768.00 | 84.40 | 0.76 | 541.09 | true | 0.842149;0.845273;0.845316;0.842161;0.845662 | 31744000;31744000;31744000;31744000;31744000 | 20099584;20097472;20105024;20102528;20105280 | 21695936;21695040;21698144;21706016;21696224 | |
258 | densenet0_stage3_relu34_fwd | Activation | [32,800,14,14] | 1425.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 55.33 | 10035200 | 20070752.00 | 20055605.33 | 94.80 | 0.25 | 181.36 | true | 0.950096;0.949264;0.947899;0.946066;0.946253 | 10035200;10035200;10035200;10035200;10035200 | 20070752;20070784;20070752;20070752;20070752 | 20059328;20051520;20053504;20063840;20053984 | |
259 | densenet0_stage3_conv34_fwd | Convolution | [32,800,14,14] | 76130.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 172.67 | 1286111232 | 28940618.67 | 4895477.33 | 8.20 | 38.01 | 7448.51 | false | 0.082286;0.082241;0.081988;0.082154;0.082217 | 1286111232;1286111232;1286111232;1286111232;1286111232 | 4896448;4898048;4891168;4894432;4895552 | 28933312;29187936;28802048;29086496;28742592 | |
259 | densenet0_stage3_conv34_fwd | Convolution | [32,800,14,14] | 76130.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058109;0.058586;0.057940;0.058060;0.057928 | 0;0;0;0;0 | 2432;2432;2432;2432;2432 | 96;96;96;96;96 | |
260 | densenet0_stage3_batchnorm35_fwd | BatchNorm | [32,128,14,14] | 753.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.33 | 5079040 | 1598005.33 | 2056874.67 | 42.60 | 1.39 | 411.83 | true | 0.427657;0.421333;0.428915;0.423661;0.425570 | 5079040;5079040;5079040;5079040;5079040 | 1582432;1597088;1583040;1638688;1613888 | 2058848;2048512;2075552;2047616;2063264 | |
261 | densenet0_stage3_relu35_fwd | Activation | [32,128,14,14] | 314.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 15605.33 | 1599562.67 | 77.60 | 0.99 | 229.38 | true | 0.776266;0.775219;0.772268;0.784563;0.777589 | 1605632;1605632;1605632;1605632;1605632 | 14880;16544;14880;15392;18976 | 1605696;1600608;1612864;1591840;1592384 | |
262 | densenet0_stage3_conv35_fwd | Convolution | [32,128,14,14] | 39627.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1060053.33 | 12.50 | 265.72 | 7897.32 | false | 0.124836;0.124824;0.124826;0.124834;0.124833 | 281673728;281673728;281673728;281673728;281673728 | 32;0;0;0;0 | 1057696;1048192;1062976;1064416;1059488 | |
262 | densenet0_stage3_conv35_fwd | Convolution | [32,128,14,14] | 39627.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147733.33 | 216874.67 | 6.20 | 0.65 | 50.90 | true | 0.062361;0.062358;0.062363;0.062358;0.062357 | 237568;237568;237568;237568;237568 | 147904;147648;147648;147904;147648 | 210688;223584;218528;217664;214432 | |
263 | densenet0_stage3_concat17 | Concat | [32,800,14,14] | 1559.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 38.71 | 0 | 10436762.67 | 10919925.33 | 74.10 | 0.00 | 0.00 | true | 0.855119;0.628526;0.855659;0.625869;0.854237;0.620315;0.856449;0.627868;0.856378;0.626656 | 0;0;0;0;0;0;0;0;0;0 | 20070464;802976;20070464;802976;20070464;802976;20070464;802976;20070464;803232 | 20949824;903808;20946784;894880;20942208;899520;20932320;900256;20941440;898080 | |
263 | densenet0_stage3_concat17 | Concat | [32,800,14,14] | 1559.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 30.57 | 0 | 10436762.67 | 10919925.33 | 74.10 | 0.00 | 0.00 | true | 0.855119;0.628526;0.855659;0.625869;0.854237;0.620315;0.856449;0.627868;0.856378;0.626656 | 0;0;0;0;0;0;0;0;0;0 | 20070464;802976;20070464;802976;20070464;802976;20070464;802976;20070464;803232 | 20949824;903808;20946784;894880;20942208;899520;20932320;900256;20941440;898080 | |
264 | densenet0_stage3_batchnorm36_fwd | BatchNorm | [32,832,14,14] | 4793.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 61.00 | 33013760 | 20915189.33 | 22583232.00 | 84.70 | 0.76 | 541.21 | true | 0.847613;0.846207;0.849281;0.846733;0.847149 | 33013760;33013760;33013760;33013760;33013760 | 22578400;22580416;22591200;22590880;22576032 | 20918048;20916320;20911200;20920096;20907968 | |
265 | densenet0_stage3_relu36_fwd | Activation | [32,832,14,14] | 1755.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 58.00 | 10436608 | 20873578.67 | 20862773.33 | 94.50 | 0.25 | 179.94 | true | 0.946930;0.950765;0.943968;0.942827;0.944413 | 10436608;10436608;10436608;10436608;10436608 | 20873600;20873568;20873600;20873568;20873568 | 20865056;20867232;20858624;20864576;20858688 | |
266 | densenet0_stage3_conv36_fwd | Convolution | [32,832,14,14] | 78421.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 178.00 | 1337491456 | 30581642.67 | 5167861.33 | 8.20 | 37.41 | 7514.00 | false | 0.082100;0.082461;0.081934;0.081924;0.082186 | 1337491456;1337491456;1337491456;1337491456;1337491456 | 30584032;30615232;30545664;30674528;30528672 | 5166080;5153984;5167968;5169536;5172224 | |
266 | densenet0_stage3_conv36_fwd | Convolution | [32,832,14,14] | 78421.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058126;0.058036;0.057951;0.058077;0.058506 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
267 | densenet0_stage3_batchnorm37_fwd | BatchNorm | [32,128,14,14] | 741.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.67 | 5079040 | 1718197.33 | 1784448.00 | 40.90 | 1.45 | 400.97 | true | 0.407133;0.412205;0.416308;0.408459;0.405226 | 5079040;5079040;5079040;5079040;5079040 | 1790464;1783168;1786656;1783040;1783520 | 1712096;1721344;1721152;1755616;1705312 | |
268 | densenet0_stage3_relu37_fwd | Activation | [32,128,14,14] | 316.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.33 | 1605632 | 15520.00 | 1612885.33 | 76.90 | 0.99 | 253.53 | true | 0.767692;0.763059;0.772147;0.771071;0.768618 | 1605632;1605632;1605632;1605632;1605632 | 15136;15136;15264;16160;16160 | 1610112;1615296;1640608;1597984;1613248 | |
269 | densenet0_stage3_conv37_fwd | Convolution | [32,128,14,14] | 39575 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1127722.67 | 12.50 | 249.77 | 7897.32 | false | 0.124830;0.124831;0.124824;0.124826;0.124834 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1130592;1125024;1132864;1116864;1127552 | |
269 | densenet0_stage3_conv37_fwd | Convolution | [32,128,14,14] | 39575 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 169813.33 | 6.20 | 0.75 | 50.90 | true | 0.062355;0.062353;0.062361;0.062361;0.062352 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 168768;173376;157088;179392;167296 | |
270 | densenet0_stage3_concat18 | Concat | [32,832,14,14] | 2096.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 39.71 | 0 | 10838362.67 | 11350784.00 | 74.30 | 0.00 | 0.00 | true | 0.855966;0.614589;0.855954;0.629748;0.855433;0.622393;0.855095;0.634867;0.855242;0.626048 | 0;0;0;0;0;0;0;0;0;0 | 21756448;946016;21757536;944608;21776800;930496;21762272;940288;21759808;936768 | 20873280;803104;20873280;803072;20873280;804064;20873280;803168;20873280;803040 | |
270 | densenet0_stage3_concat18 | Concat | [32,832,14,14] | 2096.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 31.43 | 0 | 10838362.67 | 11350784.00 | 74.30 | 0.00 | 0.00 | true | 0.855966;0.614589;0.855954;0.629748;0.855433;0.622393;0.855095;0.634867;0.855242;0.626048 | 0;0;0;0;0;0;0;0;0;0 | 21756448;946016;21757536;944608;21776800;930496;21762272;940288;21759808;936768 | 20873280;803104;20873280;803072;20873280;804064;20873280;803168;20873280;803040 | |
271 | densenet0_stage3_batchnorm38_fwd | BatchNorm | [32,864,14,14] | 3660.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 63.00 | 34283520 | 21737226.67 | 23443829.33 | 85.00 | 0.76 | 544.18 | true | 0.849128;0.849850;0.850581;0.849871;0.846672 | 34283520;34283520;34283520;34283520;34283520 | 21734304;21741056;21735168;21736736;21739776 | 23450080;23444064;23444032;23432352;23443392 | |
272 | densenet0_stage3_relu38_fwd | Activation | [32,864,14,14] | 1482.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 59.67 | 10838016 | 21676384.00 | 21655808.00 | 94.10 | 0.25 | 181.64 | true | 0.941084;0.939850;0.940081;0.942557;0.940966 | 10838016;10838016;10838016;10838016;10838016 | 21677728;21676384;21676384;21676384;21676384 | 21656640;21652512;21652192;21661728;21658272 | |
273 | densenet0_stage3_conv38_fwd | Convolution | [32,864,14,14] | 80053.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 184.00 | 1388871680 | 32002826.67 | 5039072.00 | 8.20 | 37.49 | 7548.22 | false | 0.082115;0.082153;0.082381;0.082190;0.081965 | 1388871680;1388871680;1388871680;1388871680;1388871680 | 32153536;31912416;32085568;31945408;31977504 | 5038912;5037472;5040832;5010464;5044160 | |
273 | densenet0_stage3_conv38_fwd | Convolution | [32,864,14,14] | 80053.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058092;0.058043;0.057942;0.058048;0.057902 | 0;0;0;0;0 | 2432;2432;2432;2432;2432 | 96;96;96;96;96 | |
274 | densenet0_stage3_batchnorm39_fwd | BatchNorm | [32,128,14,14] | 774.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 13.00 | 5079040 | 1677802.67 | 1923253.33 | 41.20 | 1.41 | 390.70 | true | 0.407484;0.415245;0.408501;0.413640;0.412520 | 5079040;5079040;5079040;5079040;5079040 | 1915936;1928608;1914272;1935200;1925216 | 1680128;1674528;1678752;1716192;1663872 | |
275 | densenet0_stage3_relu39_fwd | Activation | [32,128,14,14] | 266.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 14794.67 | 1612330.67 | 77.50 | 0.99 | 229.38 | true | 0.774244;0.778456;0.775134;0.775865;0.773704 | 1605632;1605632;1605632;1605632;1605632 | 14368;14880;13216;15136;15136 | 1634112;1625536;1604480;1606784;1604672 | |
276 | densenet0_stage3_conv39_fwd | Convolution | [32,128,14,14] | 39601.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1021536.00 | 12.50 | 275.74 | 7897.32 | false | 0.124829;0.124833;0.124828;0.124838;0.124833 | 281673728;281673728;281673728;281673728;281673728 | 1013088;1023424;1026112;1015072;1026528 | 0;0;0;0;0 | |
276 | densenet0_stage3_conv39_fwd | Convolution | [32,128,14,14] | 39601.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 255221.33 | 6.20 | 0.59 | 50.90 | true | 0.062360;0.062348;0.062364;0.062347;0.062360 | 237568;237568;237568;237568;237568 | 256832;254784;253664;257760;254048 | 147648;147648;147648;147648;147648 | |
277 | densenet0_stage3_concat19 | Concat | [32,864,14,14] | 1306.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 41.14 | 0 | 11239552.00 | 11753264.00 | 74.20 | 0.00 | 0.00 | true | 0.858037;0.618583;0.858804;0.615837;0.858739;0.619471;0.860494;0.635101;0.857326;0.620474 | 0;0;0;0;0;0;0;0;0;0 | 21676128;803008;21676096;803008;21676096;803008;21676128;802976;21676096;802976 | 22617312;897024;22615040;891296;22625984;892224;22600800;897184;22622432;891168 | |
277 | densenet0_stage3_concat19 | Concat | [32,864,14,14] | 1306.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 32.43 | 0 | 11239552.00 | 11753264.00 | 74.20 | 0.00 | 0.00 | true | 0.858037;0.618583;0.858804;0.615837;0.858739;0.619471;0.860494;0.635101;0.857326;0.620474 | 0;0;0;0;0;0;0;0;0;0 | 21676128;803008;21676096;803008;21676096;803008;21676128;802976;21676096;802976 | 22617312;897024;22615040;891296;22625984;892224;22600800;897184;22622432;891168 | |
278 | densenet0_stage3_batchnorm40_fwd | BatchNorm | [32,896,14,14] | 4789.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 66.00 | 35553280 | 22547776.00 | 24345696.00 | 85.30 | 0.76 | 538.69 | true | 0.850727;0.851308;0.854156;0.854414;0.852427 | 35553280;35553280;35553280;35553280;35553280 | 22548512;22551840;22545536;22549280;22545344 | 24344480;24351392;24335008;24352992;24341216 | |
279 | densenet0_stage3_relu40_fwd | Activation | [32,896,14,14] | 1206.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 61.67 | 11239424 | 22479200.00 | 22459114.67 | 94.50 | 0.25 | 182.26 | true | 0.944065;0.944544;0.944898;0.945846;0.944434 | 11239424;11239424;11239424;11239424;11239424 | 22479200;22479200;22479200;22479200;22479456 | 22463040;22454112;22470528;22460192;22449760 | |
280 | densenet0_stage3_conv40_fwd | Convolution | [32,896,14,14] | 84953 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 190.00 | 1440251904 | 33540842.67 | 5238912.00 | 8.20 | 37.14 | 7580.27 | false | 0.082293;0.082041;0.082337;0.082207;0.082395 | 1440251904;1440251904;1440251904;1440251904;1440251904 | 33554848;33524160;33603392;33543520;33513536 | 5227072;5252736;5238144;5242016;5236576 | |
280 | densenet0_stage3_conv40_fwd | Convolution | [32,896,14,14] | 84953 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 181.33 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058104;0.058345;0.058623;0.058060;0.058284 | 0;0;0;0;0 | 352;96;96;1376;96 | 2944;2432;2432;4480;2432 | |
281 | densenet0_stage3_batchnorm41_fwd | BatchNorm | [32,128,14,14] | 762.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 1773386.67 | 1712874.67 | 40.30 | 1.46 | 423.25 | true | 0.400107;0.394007;0.405805;0.410715;0.403175 | 5079040;5079040;5079040;5079040;5079040 | 1713504;1716256;1710720;1704000;1714400 | 1781216;1769152;1763104;1769792;1782720 | |
282 | densenet0_stage3_relu41_fwd | Activation | [32,128,14,14] | 215.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 15605.33 | 1621898.67 | 76.90 | 0.98 | 240.83 | true | 0.766913;0.770057;0.775166;0.769120;0.766231 | 1605632;1605632;1605632;1605632;1605632 | 14368;15392;16288;15136;19488 | 1621536;1631872;1627488;1616672;1600928 | |
283 | densenet0_stage3_conv41_fwd | Convolution | [32,128,14,14] | 39584.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1112458.67 | 12.50 | 253.20 | 7897.32 | false | 0.124830;0.124832;0.124826;0.124842;0.124834 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1105504;1111616;1115264;1114624;1111136 | |
283 | densenet0_stage3_conv41_fwd | Convolution | [32,128,14,14] | 39584.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 179040.00 | 6.20 | 0.73 | 47.51 | true | 0.062357;0.062363;0.062354;0.062354;0.062362 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 180416;177024;171488;183136;179680 | |
284 | densenet0_stage3_concat20 | Concat | [32,896,14,14] | 1756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 42.29 | 0 | 11642192.00 | 12170944.00 | 74.30 | 0.00 | 0.00 | true | 0.856099;0.627683;0.856303;0.637947;0.857705;0.622018;0.856859;0.620116;0.857071;0.625690 | 0;0;0;0;0;0;0;0;0;0 | 22478912;802976;22478912;810432;22478912;803008;22478912;802976;22478944;802976 | 23457248;884992;23456224;877792;23459744;882336;23466016;876896;23458496;886368 | |
284 | densenet0_stage3_concat20 | Concat | [32,896,14,14] | 1756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 33.14 | 0 | 11642192.00 | 12170944.00 | 74.30 | 0.00 | 0.00 | true | 0.856099;0.627683;0.856303;0.637947;0.857705;0.622018;0.856859;0.620116;0.857071;0.625690 | 0;0;0;0;0;0;0;0;0;0 | 22478912;802976;22478912;810432;22478912;803008;22478912;802976;22478944;802976 | 23457248;884992;23456224;877792;23459744;882336;23466016;876896;23458496;886368 | |
285 | densenet0_stage3_batchnorm42_fwd | BatchNorm | [32,928,14,14] | 4090.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 67.67 | 36823040 | 23367648.00 | 25219424.00 | 85.30 | 0.76 | 544.18 | true | 0.853304;0.854331;0.851935;0.853163;0.853371 | 36823040;36823040;36823040;36823040;36823040 | 23369728;23369568;23362944;23364928;23368448 | 25229888;25229408;25213056;25210336;25215808 | |
286 | densenet0_stage3_relu42_fwd | Activation | [32,928,14,14] | 1298.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 63.00 | 11640832 | 23282026.67 | 23268682.67 | 94.90 | 0.25 | 184.78 | true | 0.948126;0.951828;0.948729;0.946732;0.949458 | 11640832;11640832;11640832;11640832;11640832 | 23282048;23282016;23282016;23282048;23282016 | 23267104;23262176;23287936;23271808;23267136 | |
287 | densenet0_stage3_conv42_fwd | Convolution | [32,928,14,14] | 87214.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 196.00 | 1491632128 | 34941013.33 | 5239029.33 | 8.20 | 37.12 | 7610.37 | false | 0.082377;0.082480;0.082301;0.082410;0.082439 | 1491632128;1491632128;1491632128;1491632128;1491632128 | 34907520;35025376;34890144;34833152;35129120 | 5229920;5240928;5233440;5242720;5248480 | |
287 | densenet0_stage3_conv42_fwd | Convolution | [32,928,14,14] | 87214.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058114;0.058056;0.058206;0.058048;0.057935 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2560 | |
288 | densenet0_stage3_batchnorm43_fwd | BatchNorm | [32,128,14,14] | 788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 1803424.00 | 1702581.33 | 40.50 | 1.45 | 423.25 | true | 0.409642;0.403415;0.405466;0.395477;0.406063 | 5079040;5079040;5079040;5079040;5079040 | 1846848;1810624;1804864;1794784;1794048 | 1703712;1700768;1702880;1712672;1701152 | |
289 | densenet0_stage3_relu43_fwd | Activation | [32,128,14,14] | 218.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 15136.00 | 1646698.67 | 77.50 | 0.97 | 229.38 | true | 0.775328;0.774317;0.776355;0.774482;0.777089 | 1605632;1605632;1605632;1605632;1605632 | 18208;14496;15392;13472;15520 | 1651744;1661280;1639328;1634176;1649024 | |
290 | densenet0_stage3_conv43_fwd | Convolution | [32,128,14,14] | 39638.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 281673728 | 0.00 | 1120970.67 | 12.50 | 251.28 | 7897.32 | false | 0.124831;0.124832;0.124823;0.124834;0.124829 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1121632;1118400;1124096;1120000;1121280 | |
290 | densenet0_stage3_conv43_fwd | Convolution | [32,128,14,14] | 39638.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 163936.00 | 6.20 | 0.76 | 50.90 | true | 0.062360;0.062354;0.062354;0.062349;0.062353 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 164576;163520;163360;163712;170240 | |
291 | densenet0_stage3_concat21 | Concat | [32,928,14,14] | 1373 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 44.00 | 0 | 12042378.67 | 12591893.33 | 74.40 | 0.00 | 0.00 | true | 0.857124;0.633655;0.857612;0.628888;0.856232;0.620731;0.856256;0.617909;0.857707;0.629869 | 0;0;0;0;0;0;0;0;0;0 | 23281728;802976;23281728;802976;23281728;803072;23281728;802976;23281760;803040 | 24273248;918112;24272320;917888;24266912;919968;24265280;913248;24263200;907264 | |
291 | densenet0_stage3_concat21 | Concat | [32,928,14,14] | 1373 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 34.57 | 0 | 12042378.67 | 12591893.33 | 74.40 | 0.00 | 0.00 | true | 0.857124;0.633655;0.857612;0.628888;0.856232;0.620731;0.856256;0.617909;0.857707;0.629869 | 0;0;0;0;0;0;0;0;0;0 | 23281728;802976;23281728;802976;23281728;803072;23281728;802976;23281760;803040 | 24273248;918112;24272320;917888;24266912;919968;24265280;913248;24263200;907264 | |
292 | densenet0_stage3_batchnorm44_fwd | BatchNorm | [32,960,14,14] | 5525.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 69.33 | 38092800 | 24182378.67 | 26081866.67 | 85.60 | 0.76 | 549.42 | true | 0.855912;0.857392;0.857801;0.855762;0.856043 | 38092800;38092800;38092800;38092800;38092800 | 24180256;24182496;24181376;24184640;24183264 | 26075616;26074304;26086752;26083232;26090368 | |
293 | densenet0_stage3_relu44_fwd | Activation | [32,960,14,14] | 1263 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 65.67 | 12042240 | 24084832.00 | 24064149.33 | 95.10 | 0.25 | 183.38 | true | 0.950980;0.952209;0.949577;0.952213;0.949070 | 12042240;12042240;12042240;12042240;12042240 | 24084832;24084864;24084832;24084832;24084832 | 24062432;24064672;24062368;24065344;24068896 | |
294 | densenet0_stage3_conv44_fwd | Convolution | [32,960,14,14] | 89227 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 202.00 | 1543012352 | 36489706.67 | 4938730.67 | 8.20 | 37.25 | 7638.68 | false | 0.082474;0.082595;0.082476;0.082468;0.082356 | 1543012352;1543012352;1543012352;1543012352;1543012352 | 4940224;4927360;4946240;4929728;4950272 | 36558208;36486336;36434528;36452352;36530432 | |
294 | densenet0_stage3_conv44_fwd | Convolution | [32,960,14,14] | 89227 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 3541.33 | 5.80 | 0.00 | 0.00 | true | 0.058068;0.058039;0.057898;0.058041;0.057928 | 0;0;0;0;0 | 96;96;96;96;96 | 7808;5760;2432;2432;2432 | |
295 | densenet0_stage3_batchnorm45_fwd | BatchNorm | [32,128,14,14] | 788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 5079040 | 1682901.33 | 2006197.33 | 40.70 | 1.38 | 423.25 | true | 0.402985;0.410473;0.407709;0.401515;0.411730 | 5079040;5079040;5079040;5079040;5079040 | 1675040;1687712;1697024;1685952;1674464 | 2005664;2017568;2001824;2008384;2004544 | |
296 | densenet0_stage3_relu45_fwd | Activation | [32,128,14,14] | 220.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 14453.33 | 1594432.00 | 77.10 | 1.00 | 229.38 | true | 0.771272;0.776134;0.766973;0.768722;0.772795 | 1605632;1605632;1605632;1605632;1605632 | 14240;14368;14368;14624;15136 | 1595296;1588192;1592992;1595008;1598240 | |
297 | densenet0_stage3_conv45_fwd | Convolution | [32,128,14,14] | 39543.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 0.00 | 1035754.67 | 12.50 | 271.95 | 7824.27 | false | 0.124828;0.124829;0.124824;0.124823;0.124835 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;5888 | 1030080;1039328;1037856;1041088;1026400 | |
297 | densenet0_stage3_conv45_fwd | Convolution | [32,128,14,14] | 39543.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 246474.67 | 6.20 | 0.60 | 50.90 | true | 0.062373;0.062359;0.062359;0.062354;0.062357 | 237568;237568;237568;237568;237568 | 251488;237056;245152;247552;246720 | 152768;147648;147648;147648;147648 | |
298 | densenet0_stage3_concat22 | Concat | [32,960,14,14] | 1822 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 45.14 | 0 | 12443770.67 | 13031930.67 | 73.90 | 0.00 | 0.00 | true | 0.857967;0.614836;0.858610;0.612111;0.857551;0.620475;0.856924;0.616038;0.858939;0.623317 | 0;0;0;0;0;0;0;0;0;0 | 25136256;920256;25150816;919552;25154720;899168;25154784;911744;25152960;896928 | 24084544;802976;24084608;803040;24084544;802976;24086336;802976;24084544;802976 | |
298 | densenet0_stage3_concat22 | Concat | [32,960,14,14] | 1822 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 35.43 | 0 | 12443770.67 | 13031930.67 | 73.90 | 0.00 | 0.00 | true | 0.857967;0.614836;0.858610;0.612111;0.857551;0.620475;0.856924;0.616038;0.858939;0.623317 | 0;0;0;0;0;0;0;0;0;0 | 25136256;920256;25150816;919552;25154720;899168;25154784;911744;25152960;896928 | 24084544;802976;24084608;803040;24084544;802976;24086336;802976;24084544;802976 | |
299 | densenet0_stage3_batchnorm46_fwd | BatchNorm | [32,992,14,14] | 4506.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 71.67 | 39362560 | 24985685.33 | 26939626.67 | 85.90 | 0.76 | 549.24 | true | 0.859863;0.856273;0.861467;0.858962;0.859386 | 39362560;39362560;39362560;39362560;39362560 | 26934048;26936512;26953440;26940736;26941632 | 24988928;24985504;24986432;24985120;24982592 | |
300 | densenet0_stage3_relu46_fwd | Activation | [32,992,14,14] | 1299.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 68.00 | 12443648 | 24887648.00 | 24868202.67 | 94.50 | 0.25 | 182.99 | true | 0.945031;0.943298;0.945529;0.946068;0.944470 | 12443648;12443648;12443648;12443648;12443648 | 24869216;24867936;24863840;24867456;24874400 | 24887680;24887648;24887648;24887648;24887648 | |
301 | densenet0_stage3_conv46_fwd | Convolution | [32,992,14,14] | 93974 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 207.00 | 1594392576 | 38650912.00 | 5083605.33 | 8.30 | 36.46 | 7702.38 | false | 0.082503;0.082564;0.082569;0.082646;0.082548 | 1594392576;1594392576;1594392576;1594392576;1594392576 | 38642944;38411872;38682592;38688736;38627200 | 5079616;5087456;5078752;5083744;5095104 | |
301 | densenet0_stage3_conv46_fwd | Convolution | [32,992,14,14] | 93974 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058089;0.058077;0.057926;0.058072;0.057905 | 0;0;0;0;0 | 96;96;96;96;96 | 2560;2432;2432;2432;2432 | |
302 | densenet0_stage3_batchnorm47_fwd | BatchNorm | [32,128,14,14] | 785.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.33 | 5079040 | 1777973.33 | 1851850.67 | 42.80 | 1.40 | 411.83 | true | 0.440008;0.422803;0.426767;0.428527;0.429488 | 5079040;5079040;5079040;5079040;5079040 | 1771296;1774848;1785664;1773408;1793760 | 1871264;1850464;1849760;1855328;1849600 | |
303 | densenet0_stage3_relu47_fwd | Activation | [32,128,14,14] | 218.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 15477.33 | 1625482.67 | 77.50 | 0.98 | 240.83 | true | 0.775120;0.774313;0.776515;0.776602;0.774582 | 1605632;1605632;1605632;1605632;1605632 | 15008;28576;14624;16032;15392 | 1603424;1663072;1629824;1612224;1634400 | |
304 | densenet0_stage3_conv47_fwd | Convolution | [32,128,14,14] | 39592.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 281673728 | 0.00 | 1106144.00 | 12.50 | 254.64 | 7824.27 | false | 0.124827;0.124827;0.124827;0.124835;0.124832 | 281673728;281673728;281673728;281673728;281673728 | 0;0;0;0;0 | 1102400;1105440;1104960;1111008;1108032 | |
304 | densenet0_stage3_conv47_fwd | Convolution | [32,128,14,14] | 39592.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 172693.33 | 6.20 | 0.74 | 50.90 | true | 0.062357;0.062353;0.062360;0.062342;0.062358 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 186560;158304;172960;173472;171648 | |
305 | densenet0_stage3_concat23 | Concat | [32,992,14,14] | 1539.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 46.29 | 0 | 12845184.00 | 13443813.33 | 74.20 | 0.00 | 0.00 | true | 0.859289;0.610082;0.859089;0.622906;0.859859;0.626401;0.859646;0.621479;0.858269;0.625193 | 0;0;0;0;0;0;0;0;0;0 | 25961056;935936;25949344;943200;25949440;937408;25947552;932640;25956544;929440 | 24887360;803008;24887360;803008;24887360;802976;24887360;803008;24887360;803008 | |
305 | densenet0_stage3_concat23 | Concat | [32,992,14,14] | 1539.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 36.43 | 0 | 12845184.00 | 13443813.33 | 74.20 | 0.00 | 0.00 | true | 0.859289;0.610082;0.859089;0.622906;0.859859;0.626401;0.859646;0.621479;0.858269;0.625193 | 0;0;0;0;0;0;0;0;0;0 | 25961056;935936;25949344;943200;25949440;937408;25947552;932640;25956544;929440 | 24887360;803008;24887360;803008;24887360;802976;24887360;803008;24887360;803008 | |
306 | densenet0_batchnorm3_fwd | BatchNorm | [32,1024,14,14] | 3440 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 74.00 | 40632320 | 25803232.00 | 27804373.33 | 85.90 | 0.76 | 549.09 | true | 0.858520;0.858372;0.859630;0.860176;0.860080 | 40632320;40632320;40632320;40632320;40632320 | 25800768;25803584;25803328;25806720;25802784 | 27805728;27803552;27800096;27816128;27803840 | |
307 | densenet0_relu3_fwd | Activation | [32,1024,14,14] | 1432 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.67 | 12845056 | 25690464.00 | 25676490.67 | 94.50 | 0.25 | 184.38 | true | 0.943165;0.946727;0.945838;0.945815;0.944776 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690720;25690464;25690464 | 25679584;25673824;25668704;25676064;25680096 | |
308 | densenet0_conv3_fwd | Convolution | [32,1024,14,14] | 361489.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 128, 6, 7, 3, 3, 5, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 725.00 | 6579879936 | 832.00 | 3744.00 | 31.90 | 1437910.83 | 9075.70 | false | 0.319103;0.319587;0.319897;0.317962;0.317923 | 6579879936;6579879936;6579879936;6579879936;6579879936 | 832;832;832;832;33472 | 3744;3744;3744;3744;88128 | |
309 | densenet0_pool3_fwd | Pooling | [32,512,14,14] | 8924.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 32.33 | 16859136 | 11456650.67 | 5329514.67 | 51.50 | 1.00 | 521.42 | true | 0.515885;0.517545;0.514293;0.515971;0.513357 | 16859136;16859136;16859136;16859136;16859136 | 11342656;11422400;11441952;11505600;11534176 | 5383328;5356672;5329408;5205664;5302464 | |
310 | densenet0_stage4_batchnorm0_fwd | BatchNorm | [32,512,7,7] | 865 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 30.67 | 5619712 | 1686837.33 | 2037930.67 | 17.90 | 1.51 | 183.25 | true | 0.177753;0.173714;0.197574;0.180879;0.178448 | 5619712;5619712;5619712;5619712;5619712 | 2005920;2030304;2041472;2052928;2042016 | 1825120;1716832;1665632;1616736;1678048 | |
311 | densenet0_stage4_relu0_fwd | Activation | [32,512,7,7] | 226 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 6346.67 | 1571413.33 | 77.80 | 1.02 | 229.38 | true | 0.778589;0.791904;0.774467;0.778498;0.776172 | 1605632;1605632;1605632;1605632;1605632 | 11552;7712;4128;6176;5152 | 1563168;1567968;1585056;1583104;1527808 | |
312 | densenet0_stage4_conv0_fwd | Convolution | [32,512,7,7] | 14471 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 57.00 | 205721600 | 290005.33 | 1086549.33 | 7.10 | 149.45 | 3609.15 | false | 0.071301;0.070172;0.071148;0.070901;0.070205 | 205721600;205721600;205721600;205721600;205721600 | 288192;292160;291776;285568;290048 | 1077312;1085952;1096384;1073984;1096416 | |
313 | densenet0_stage4_batchnorm1_fwd | BatchNorm | [32,128,7,7] | 175.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.33 | 1404928 | 2304.00 | 643733.33 | 12.40 | 2.17 | 135.97 | true | 0.122596;0.126281;0.124604;0.123482;0.123421 | 1404928;1404928;1404928;1404928;1404928 | 2304;2304;2304;2304;2304 | 639872;643584;662784;647744;637632 | |
314 | densenet0_stage4_relu1_fwd | Activation | [32,128,7,7] | 111 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 224.00 | 3413.33 | 55.10 | 110.36 | 80.28 | false | 0.551383;0.552160;0.549875;0.550407;0.550335 | 401408;401408;401408;401408;401408 | 224;224;224;224;224 | 3584;3200;3584;3200;3456 | |
315 | densenet0_stage4_conv1_fwd | Convolution | [32,128,7,7] | 10971 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 1877.33 | 261749.33 | 12.50 | 534.23 | 4023.91 | false | 0.124899;0.124900;0.124898;0.124900;0.124891 | 140836864;140836864;140836864;140836864;140836864 | 262176;260896;261728;261856;261664 | 1280;1792;2048;2048;1792 | |
315 | densenet0_stage4_conv1_fwd | Convolution | [32,128,7,7] | 10971 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147712.00 | 1962.67 | 6.20 | 1.59 | 47.51 | true | 0.062389;0.062389;0.062385;0.062390;0.062392 | 237568;237568;237568;237568;237568 | 147456;147712;147712;147712;147712 | 1792;1920;1920;2048;2048 | |
316 | densenet0_stage4_concat0 | Concat | [32,512,7,7] | 321 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.57 | 0 | 1609594.67 | 1578096.00 | 51.50 | 0.00 | 0.00 | true | 0.778817;0.256016;0.782154;0.250032;0.778456;0.250793;0.783510;0.251317;0.776927;0.250625 | 0;0;0;0;0;0;0;0;0;0 | 3141888;58400;3173184;54048;3176512;55328;3170112;49568;3173376;58656 | 2855552;311424;2841216;317440;2843840;309152;2843584;311072;2845536;303392 | |
316 | densenet0_stage4_concat0 | Concat | [32,512,7,7] | 321 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.43 | 0 | 1609594.67 | 1578096.00 | 51.50 | 0.00 | 0.00 | true | 0.778817;0.256016;0.782154;0.250032;0.778456;0.250793;0.783510;0.251317;0.776927;0.250625 | 0;0;0;0;0;0;0;0;0;0 | 3141888;58400;3173184;54048;3176512;55328;3170112;49568;3173376;58656 | 2855552;311424;2841216;317440;2843840;309152;2843584;311072;2845536;303392 | |
317 | densenet0_stage4_batchnorm2_fwd | BatchNorm | [32,544,7,7] | 507.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 32.33 | 5970944 | 2217397.33 | 3814581.33 | 17.80 | 0.99 | 184.67 | true | 0.177824;0.183276;0.176982;0.177378;0.177658 | 5970944;5970944;5970944;5970944;5970944 | 2239904;2230816;2213024;2185568;2208352 | 3814432;3811008;3817856;3816512;3812800 | |
318 | densenet0_stage4_relu2_fwd | Activation | [32,544,7,7] | 275.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.33 | 1705984 | 64010.67 | 2202272.00 | 78.40 | 0.75 | 232.64 | true | 0.787792;0.784432;0.785611;0.779620;0.782188 | 1705984;1705984;1705984;1705984;1705984 | 62112;61792;60832;68128;68256 | 2215200;2184224;2186272;2230848;2205344 | |
319 | densenet0_stage4_conv2_fwd | Convolution | [32,544,7,7] | 15198 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 57.33 | 218566656 | 282453.33 | 1070581.33 | 6.90 | 161.54 | 3812.23 | false | 0.070211;0.069972;0.068964;0.069377;0.069058 | 218566656;218566656;218566656;218566656;218566656 | 285024;279904;283392;280512;283456 | 1075776;1067936;1071968;1060576;1071840 | |
320 | densenet0_stage4_batchnorm3_fwd | BatchNorm | [32,128,7,7] | 178.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2048.00 | 811797.33 | 12.40 | 1.73 | 127.72 | true | 0.123195;0.122220;0.132540;0.122529;0.125973 | 1404928;1404928;1404928;1404928;1404928 | 2048;4096;2048;2048;2048 | 809056;817632;805376;808704;818144 | |
321 | densenet0_stage4_relu3_fwd | Activation | [32,128,7,7] | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 224.00 | 20469.33 | 55.80 | 19.40 | 86.01 | false | 0.548730;0.564044;0.569519;0.559794;0.551655 | 401408;401408;401408;401408;401408 | 224;5344;224;224;224 | 20128;21408;21024;19744;20256 | |
322 | densenet0_stage4_conv3_fwd | Convolution | [32,128,7,7] | 10964 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 42.67 | 261834.67 | 12.50 | 537.80 | 4062.56 | false | 0.124900;0.124900;0.124898;0.124897;0.124899 | 140836864;140836864;140836864;140836864;140836864 | 64;0;0;2048;64 | 261952;261632;261920;256960;262560 | |
322 | densenet0_stage4_conv3_fwd | Convolution | [32,128,7,7] | 10964 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 1546.67 | 6.20 | 1.59 | 54.83 | true | 0.062337;0.062337;0.062340;0.062328;0.062325 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 1696;1536;1536;1568;1536 | |
323 | densenet0_stage4_concat1 | Concat | [32,544,7,7] | 311 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.14 | 0 | 1778469.33 | 1596650.67 | 52.20 | 0.00 | 0.00 | true | 0.783737;0.259353;0.783855;0.259701;0.783571;0.264507;0.781632;0.261127;0.784701;0.260280 | 0;0;0;0;0;0;0;0;0;0 | 3411968;139680;3411968;151200;3411968;144032;3412032;138656;3412032;134816 | 2947200;241568;2955712;244384;2943072;247968;2957408;233472;2966304;234560 | |
323 | densenet0_stage4_concat1 | Concat | [32,544,7,7] | 311 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.86 | 0 | 1778469.33 | 1596650.67 | 52.20 | 0.00 | 0.00 | true | 0.783737;0.259353;0.783855;0.259701;0.783571;0.264507;0.781632;0.261127;0.784701;0.260280 | 0;0;0;0;0;0;0;0;0;0 | 3411968;139680;3411968;151200;3411968;144032;3412032;138656;3412032;134816 | 2947200;241568;2955712;244384;2943072;247968;2957408;233472;2966304;234560 | |
324 | densenet0_stage4_batchnorm4_fwd | BatchNorm | [32,576,7,7] | 500 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 34.00 | 6322176 | 3062602.67 | 4040245.33 | 18.90 | 0.89 | 185.95 | true | 0.186624;0.187713;0.190987;0.194054;0.189384 | 6322176;6322176;6322176;6322176;6322176 | 3068256;3027680;3051104;3068448;3078816 | 4046400;4034848;4041664;4039936;4039136 | |
325 | densenet0_stage4_relu4_fwd | Activation | [32,576,7,7] | 290 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1806336 | 174666.67 | 2619072.00 | 80.40 | 0.65 | 258.05 | true | 0.807297;0.801279;0.799739;0.807430;0.804031 | 1806336;1806336;1806336;1806336;1806336 | 2649152;2570208;2600672;2624960;2631584 | 173088;177952;175776;175136;172960 | |
326 | densenet0_stage4_conv4_fwd | Convolution | [32,576,7,7] | 15909 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 60.67 | 231411712 | 298186.67 | 1109237.33 | 6.90 | 164.42 | 3814.46 | false | 0.069104;0.069393;0.070619;0.069346;0.069342 | 231411712;231411712;231411712;231411712;231411712 | 297760;296928;297696;299872;299104 | 1113376;1110816;1103520;1125856;1096928 | |
327 | densenet0_stage4_batchnorm5_fwd | BatchNorm | [32,128,7,7] | 179 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2112.00 | 811744.00 | 12.40 | 1.73 | 127.72 | true | 0.124365;0.124725;0.124104;0.125151;0.123620 | 1404928;1404928;1404928;1404928;1404928 | 2112;2048;2112;7232;2112 | 812288;805664;813536;809504;813440 | |
328 | densenet0_stage4_relu5_fwd | Activation | [32,128,7,7] | 111 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 224.00 | 19082.67 | 55.10 | 20.79 | 80.28 | false | 0.549232;0.553846;0.560411;0.550482;0.549531 | 401408;401408;401408;401408;401408 | 224;224;224;224;224 | 18016;21408;19488;18464;19296 | |
329 | densenet0_stage4_conv5_fwd | Convolution | [32,128,7,7] | 10999.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 0.00 | 261664.00 | 12.50 | 538.24 | 4062.56 | false | 0.124900;0.124900;0.124900;0.124898;0.124901 | 140836864;140836864;140836864;140836864;140836864 | 261184;261408;260928;262400;265088 | 0;0;0;0;0 | |
329 | densenet0_stage4_conv5_fwd | Convolution | [32,128,7,7] | 10999.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 1557.33 | 6.20 | 1.59 | 54.83 | true | 0.062338;0.062339;0.062327;0.062337;0.062337 | 237568;237568;237568;237568;237568 | 1536;1536;2048;1536;1600 | 147456;147456;147712;147456;147456 | |
330 | densenet0_stage4_concat2 | Concat | [32,576,7,7] | 318 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.14 | 0 | 1890170.67 | 1745648.00 | 52.40 | 0.00 | 0.00 | true | 0.789364;0.255343;0.787718;0.259863;0.787440;0.258968;0.788389;0.259183;0.788892;0.259320 | 0;0;0;0;0;0;0;0;0;0 | 3232736;266112;3237184;263936;3218592;263296;3224256;268256;3235616;262304 | 3614784;165024;3612736;167328;3612736;160928;3612736;170528;3612672;164512 | |
330 | densenet0_stage4_concat2 | Concat | [32,576,7,7] | 318 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.86 | 0 | 1890170.67 | 1745648.00 | 52.40 | 0.00 | 0.00 | true | 0.789364;0.255343;0.787718;0.259863;0.787440;0.258968;0.788389;0.259183;0.788892;0.259320 | 0;0;0;0;0;0;0;0;0;0 | 3232736;266112;3237184;263936;3218592;263296;3224256;268256;3235616;262304 | 3614784;165024;3612736;167328;3612736;160928;3612736;170528;3612672;164512 | |
331 | densenet0_stage4_batchnorm6_fwd | BatchNorm | [32,608,7,7] | 550 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 35.00 | 6673408 | 3632309.33 | 4347648.00 | 19.50 | 0.84 | 190.67 | true | 0.192488;0.197184;0.194419;0.195017;0.196334 | 6673408;6673408;6673408;6673408;6673408 | 3636640;3631776;3629088;3634720;3630432 | 4360640;4342112;4347136;4347744;4348064 | |
332 | densenet0_stage4_relu6_fwd | Activation | [32,608,7,7] | 302 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.00 | 1906688 | 323978.67 | 3071594.67 | 83.30 | 0.56 | 238.34 | true | 0.829735;0.830333;0.833026;0.847573;0.834788 | 1906688;1906688;1906688;1906688;1906688 | 317984;322080;330400;321632;328224 | 3069408;3104640;3092864;3027648;3052512 | |
333 | densenet0_stage4_conv6_fwd | Convolution | [32,608,7,7] | 17282 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 63.00 | 244256768 | 315797.33 | 1130208.00 | 6.90 | 168.92 | 3877.09 | false | 0.069932;0.069212;0.067346;0.069381;0.067872 | 244256768;244256768;244256768;244256768;244256768 | 314176;315808;317408;313344;320064 | 1128320;1132448;1134880;1123104;1129856 | |
334 | densenet0_stage4_batchnorm7_fwd | BatchNorm | [32,128,7,7] | 179.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2048.00 | 809002.67 | 12.40 | 1.73 | 127.72 | true | 0.124490;0.124288;0.124140;0.124532;0.125336 | 1404928;1404928;1404928;1404928;1404928 | 3072;2048;2048;2048;2048 | 812224;810112;807424;809472;803488 | |
335 | densenet0_stage4_relu7_fwd | Activation | [32,128,7,7] | 106.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 224.00 | 20640.00 | 56.00 | 19.24 | 86.01 | false | 0.557028;0.562404;0.547892;0.561296;0.575076 | 401408;401408;401408;401408;401408 | 224;224;224;224;224 | 20256;20832;20832;20896;20192 | |
336 | densenet0_stage4_conv7_fwd | Convolution | [32,128,7,7] | 11047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.33 | 140836864 | 64.00 | 262058.67 | 12.50 | 537.29 | 4102.08 | false | 0.124901;0.124900;0.124900;0.124895;0.124899 | 140836864;140836864;140836864;140836864;140836864 | 64;64;64;64;64 | 261408;262016;262912;262176;261984 | |
336 | densenet0_stage4_conv7_fwd | Convolution | [32,128,7,7] | 11047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 147456.00 | 1536.00 | 6.20 | 1.59 | 59.39 | true | 0.062327;0.062323;0.062335;0.062330;0.062338 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 512;1536;1536;1536;1536 | |
337 | densenet0_stage4_concat3 | Concat | [32,608,7,7] | 336.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.57 | 0 | 1995952.00 | 1837994.67 | 52.60 | 0.00 | 0.00 | true | 0.793216;0.256334;0.791517;0.259151;0.805490;0.257244;0.793551;0.259317;0.795516;0.256910 | 0;0;0;0;0;0;0;0;0;0 | 3813440;181152;3813440;177056;3813440;175392;3813440;174624;3813440;177184 | 3411008;271264;3420288;254272;3440768;248960;3428640;256256;3414880;238848 | |
337 | densenet0_stage4_concat3 | Concat | [32,608,7,7] | 336.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.14 | 0 | 1995952.00 | 1837994.67 | 52.60 | 0.00 | 0.00 | true | 0.793216;0.256334;0.791517;0.259151;0.805490;0.257244;0.793551;0.259317;0.795516;0.256910 | 0;0;0;0;0;0;0;0;0;0 | 3813440;181152;3813440;177056;3813440;175392;3813440;174624;3813440;177184 | 3411008;271264;3420288;254272;3440768;248960;3428640;256256;3414880;238848 | |
338 | densenet0_stage4_batchnorm8_fwd | BatchNorm | [32,640,7,7] | 553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 37.00 | 7024640 | 3857738.67 | 4585525.33 | 19.90 | 0.83 | 189.86 | true | 0.208234;0.199672;0.194692;0.199699;0.197927 | 7024640;7024640;7024640;7024640;7024640 | 4572544;4585568;4588896;4603072;4582112 | 3855776;3858592;3858848;3855392;3866016 | |
339 | densenet0_stage4_relu8_fwd | Activation | [32,640,7,7] | 319.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.33 | 2007040 | 470816.00 | 3348373.33 | 84.50 | 0.53 | 240.85 | true | 0.853395;0.845437;0.839030;0.845489;0.844533 | 2007040;2007040;2007040;2007040;2007040 | 473248;471968;466080;467232;482848 | 3407392;3386592;3317184;3278240;3341344 | |
340 | densenet0_stage4_conv8_fwd | Convolution | [32,640,7,7] | 17896 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.00 | 257101824 | 331349.33 | 1147402.67 | 7.00 | 173.86 | 3895.48 | false | 0.068376;0.069589;0.070123;0.069969;0.070034 | 257101824;257101824;257101824;257101824;257101824 | 331104;333440;327936;332576;330368 | 1146016;1136832;1132960;1169440;1159360 | |
341 | densenet0_stage4_batchnorm9_fwd | BatchNorm | [32,128,7,7] | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2069.33 | 803882.67 | 12.50 | 1.74 | 131.71 | true | 0.123742;0.124652;0.124608;0.125670;0.124925 | 1404928;1404928;1404928;1404928;1404928 | 2112;2048;2048;2048;2112 | 799712;813728;809408;802528;796032 | |
342 | densenet0_stage4_relu9_fwd | Activation | [32,128,7,7] | 108.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 266.67 | 18880.00 | 55.00 | 20.96 | 86.01 | false | 0.550364;0.546220;0.568754;0.551541;0.548963 | 401408;401408;401408;401408;401408 | 288;6944;288;224;224 | 19040;19168;17696;20896;18432 | |
343 | densenet0_stage4_conv9_fwd | Convolution | [32,128,7,7] | 11006 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 260736.00 | 12.50 | 540.15 | 4023.91 | false | 0.124899;0.124900;0.124898;0.124899;0.124900 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 260736;260640;260832;257760;261728 | |
343 | densenet0_stage4_conv9_fwd | Convolution | [32,128,7,7] | 11006 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 147456.00 | 1664.00 | 6.20 | 1.59 | 59.39 | true | 0.062332;0.062337;0.062336;0.062340;0.062328 | 237568;237568;237568;237568;237568 | 147456;147456;147456;149760;147456 | 640;1664;1664;4736;1664 | |
344 | densenet0_stage4_concat4 | Concat | [32,640,7,7] | 341 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.86 | 0 | 2105584.00 | 1950853.33 | 52.80 | 0.00 | 0.00 | true | 0.790681;0.257702;0.795673;0.264883;0.793985;0.259626;0.797767;0.259884;0.796000;0.261990 | 0;0;0;0;0;0;0;0;0;0 | 4014144;196640;4014144;194464;4014144;196640;4014144;196896;4014144;197536 | 3608864;293376;3625056;264288;3610976;286464;3608448;296992;3621344;280800 | |
344 | densenet0_stage4_concat4 | Concat | [32,640,7,7] | 341 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.43 | 0 | 2105584.00 | 1950853.33 | 52.80 | 0.00 | 0.00 | true | 0.790681;0.257702;0.795673;0.264883;0.793985;0.259626;0.797767;0.259884;0.796000;0.261990 | 0;0;0;0;0;0;0;0;0;0 | 4014144;196640;4014144;194464;4014144;196640;4014144;196896;4014144;197536 | 3608864;293376;3625056;264288;3610976;286464;3608448;296992;3621344;280800 | |
345 | densenet0_stage4_batchnorm10_fwd | BatchNorm | [32,672,7,7] | 590.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 38.67 | 7375872 | 4081440.00 | 4841504.00 | 20.20 | 0.83 | 190.75 | true | 0.198981;0.198609;0.202935;0.205947;0.203984 | 7375872;7375872;7375872;7375872;7375872 | 4080928;4081568;4081568;4081312;4081440 | 4843104;4847456;4839584;4835808;4841824 | |
346 | densenet0_stage4_relu10_fwd | Activation | [32,672,7,7] | 332.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.33 | 2107392 | 712736.00 | 3713034.67 | 85.90 | 0.48 | 225.80 | true | 0.866310;0.859254;0.860572;0.855785;0.857006 | 2107392;2107392;2107392;2107392;2107392 | 704864;698912;730272;723040;710304 | 3746944;3698368;3719200;3719552;3700352 | |
347 | densenet0_stage4_conv10_fwd | Convolution | [32,672,7,7] | 18477.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 69.67 | 269946880 | 346421.33 | 1150304.00 | 6.90 | 180.36 | 3874.82 | false | 0.070736;0.069399;0.069097;0.069439;0.068289 | 269946880;269946880;269946880;269946880;269946880 | 349088;346560;347904;344800;344192 | 1156608;1144896;1147136;1149504;1154272 | |
348 | densenet0_stage4_batchnorm11_fwd | BatchNorm | [32,128,7,7] | 183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2069.33 | 804565.33 | 12.30 | 1.74 | 131.71 | true | 0.122777;0.122281;0.125374;0.121576;0.123228 | 1404928;1404928;1404928;1404928;1404928 | 2112;2048;2048;2048;2112 | 801440;811584;810464;801152;801792 | |
349 | densenet0_stage4_relu11_fwd | Activation | [32,128,7,7] | 115 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 245.33 | 19637.33 | 56.50 | 20.19 | 80.28 | false | 0.560424;0.565060;0.550416;0.583464;0.570151 | 401408;401408;401408;401408;401408 | 224;224;224;288;288 | 19200;19552;19552;20480;19808 | |
350 | densenet0_stage4_conv11_fwd | Convolution | [32,128,7,7] | 11000.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 0.00 | 260010.67 | 12.50 | 541.66 | 4062.56 | false | 0.124898;0.124898;0.124901;0.124899;0.124899 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 259680;260032;260000;260000;260320 | |
350 | densenet0_stage4_conv11_fwd | Convolution | [32,128,7,7] | 11000.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 1536.00 | 6.20 | 1.59 | 54.83 | true | 0.062331;0.062331;0.062327;0.062330;0.062322 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 1536;1536;1504;1536;1536 | |
351 | densenet0_stage4_concat5 | Concat | [32,672,7,7] | 349.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.29 | 0 | 2208538.67 | 2069813.33 | 53.10 | 0.00 | 0.00 | true | 0.797284;0.262299;0.798347;0.262267;0.802634;0.262670;0.798124;0.265308;0.803069;0.266428 | 0;0;0;0;0;0;0;0;0;0 | 4214848;206368;4214848;200224;4214848;200096;4214848;199968;4214848;199840 | 3857280;296704;3857632;280192;3841728;293312;3849504;275456;3843744;293888 | |
351 | densenet0_stage4_concat5 | Concat | [32,672,7,7] | 349.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.86 | 0 | 2208538.67 | 2069813.33 | 53.10 | 0.00 | 0.00 | true | 0.797284;0.262299;0.798347;0.262267;0.802634;0.262670;0.798124;0.265308;0.803069;0.266428 | 0;0;0;0;0;0;0;0;0;0 | 3857280;296704;3857632;280192;3841728;293312;3849504;275456;3843744;293888 | 4214848;206368;4214848;200224;4214848;200096;4214848;199968;4214848;199840 | |
352 | densenet0_stage4_batchnorm12_fwd | BatchNorm | [32,704,7,7] | 596.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 40.00 | 7727104 | 4289610.67 | 5053269.33 | 19.80 | 0.83 | 193.18 | true | 0.212375;0.196468;0.199534;0.197682;0.197678 | 7727104;7727104;7727104;7727104;7727104 | 4289184;4289312;4289824;4290400;4289696 | 5045120;5058624;5069920;5041536;5056064 | |
353 | densenet0_stage4_relu12_fwd | Activation | [32,704,7,7] | 342.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.00 | 2207744 | 862261.33 | 3925184.00 | 86.90 | 0.46 | 245.30 | true | 0.865242;0.869641;0.872693;0.871429;0.866727 | 2207744;2207744;2207744;2207744;2207744 | 3919360;3897440;3950176;3972480;3906016 | 864800;854944;858976;863520;864288 | |
354 | densenet0_stage4_conv12_fwd | Convolution | [32,704,7,7] | 19847.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 71.67 | 282791936 | 363616.00 | 1160874.67 | 6.90 | 185.50 | 3945.92 | false | 0.067626;0.069571;0.069362;0.070086;0.068521 | 282791936;282791936;282791936;282791936;282791936 | 371680;364384;360768;362848;363616 | 1161184;1162528;1161440;1154464;1160000 | |
355 | densenet0_stage4_batchnorm13_fwd | BatchNorm | [32,128,7,7] | 179.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2154.67 | 798069.33 | 13.00 | 1.76 | 131.71 | true | 0.129814;0.129761;0.130787;0.132491;0.129398 | 1404928;1404928;1404928;1404928;1404928 | 791776;795296;800736;809504;798176 | 2208;2208;2208;2048;2048 | |
356 | densenet0_stage4_relu13_fwd | Activation | [32,128,7,7] | 109.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 266.67 | 21653.33 | 56.10 | 18.31 | 80.28 | false | 0.562286;0.567190;0.558771;0.560302;0.559156 | 401408;401408;401408;401408;401408 | 24352;23840;21536;18848;19584 | 288;288;288;224;224 | |
357 | densenet0_stage4_conv13_fwd | Convolution | [32,128,7,7] | 10989.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 258954.67 | 12.50 | 543.87 | 4023.91 | false | 0.124900;0.124901;0.124898;0.124900;0.124899 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;6656 | 259040;258528;259296;260160;257248 | |
357 | densenet0_stage4_conv13_fwd | Convolution | [32,128,7,7] | 10989.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 1536.00 | 6.20 | 1.59 | 54.83 | true | 0.062341;0.062335;0.062334;0.062341;0.062326 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;149504 | 1536;1536;512;1536;5088 | |
358 | densenet0_stage4_concat6 | Concat | [32,704,7,7] | 358.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.71 | 0 | 2308186.67 | 2177253.33 | 53.30 | 0.00 | 0.00 | true | 0.800035;0.263732;0.801106;0.262744;0.799275;0.266424;0.801159;0.267742;0.798446;0.266791 | 0;0;0;0;0;0;0;0;0;0 | 4095552;262112;4076992;275552;4095520;271072;4080608;270720;4080512;278784 | 4415552;200864;4415552;200736;4415552;200736;4415552;200864;4415552;200736 | |
358 | densenet0_stage4_concat6 | Concat | [32,704,7,7] | 358.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.00 | 0 | 2308186.67 | 2177253.33 | 53.30 | 0.00 | 0.00 | true | 0.800035;0.263732;0.801106;0.262744;0.799275;0.266424;0.801159;0.267742;0.798446;0.266791 | 0;0;0;0;0;0;0;0;0;0 | 4415552;200864;4415552;200736;4415552;200736;4415552;200864;4415552;200736 | 4095552;262112;4076992;275552;4095520;271072;4080608;270720;4080512;278784 | |
359 | densenet0_stage4_batchnorm14_fwd | BatchNorm | [32,736,7,7] | 633.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 43.00 | 8078336 | 4497418.67 | 5271680.00 | 19.70 | 0.83 | 187.87 | true | 0.194460;0.199198;0.192738;0.197518;0.199446 | 8078336;8078336;8078336;8078336;8078336 | 4497728;4497344;4497728;4497056;4497184 | 5267584;5266816;5256352;5280640;5285952 | |
360 | densenet0_stage4_relu14_fwd | Activation | [32,736,7,7] | 353.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.00 | 2308096 | 1186037.33 | 4225013.33 | 87.20 | 0.43 | 209.83 | true | 0.875505;0.870595;0.868833;0.871831;0.874124 | 2308096;2308096;2308096;2308096;2308096 | 1190048;1185440;1186336;1160352;1186336 | 4253120;4228608;4220128;4192192;4226304 | |
361 | densenet0_stage4_conv14_fwd | Convolution | [32,736,7,7] | 20539.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 75.00 | 295636992 | 380490.67 | 1177941.33 | 7.00 | 189.70 | 3941.83 | false | 0.070016;0.069525;0.069864;0.069505;0.070623 | 295636992;295636992;295636992;295636992;295636992 | 381152;377824;381472;378848;382048 | 1182752;1171872;1179200;1189088;1151648 | |
362 | densenet0_stage4_batchnorm15_fwd | BatchNorm | [32,128,7,7] | 182.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2048.00 | 797685.33 | 12.80 | 1.76 | 131.71 | true | 0.131196;0.130051;0.125426;0.127085;0.124904 | 1404928;1404928;1404928;1404928;1404928 | 2048;2048;2048;2048;2048 | 795968;805984;795936;792896;801152 | |
363 | densenet0_stage4_relu15_fwd | Activation | [32,128,7,7] | 110.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 19338.67 | 55.80 | 20.45 | 80.28 | false | 0.558944;0.557581;0.557345;0.586930;0.553921 | 401408;401408;401408;401408;401408 | 288;288;288;544;288 | 21504;19488;18112;20064;18464 | |
364 | densenet0_stage4_conv15_fwd | Convolution | [32,128,7,7] | 10993 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 258752.00 | 12.50 | 544.29 | 4023.91 | false | 0.124898;0.124898;0.124900;0.124900;0.124898 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 258400;259232;258784;258080;259072 | |
364 | densenet0_stage4_conv15_fwd | Convolution | [32,128,7,7] | 10993 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 147456.00 | 1504.00 | 6.20 | 1.59 | 59.39 | true | 0.062330;0.062326;0.062329;0.062334;0.062323 | 237568;237568;237568;237568;237568 | 1536;1504;1504;1504;1504 | 147456;147456;147456;147456;147456 | |
365 | densenet0_stage4_concat7 | Concat | [32,736,7,7] | 372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.57 | 0 | 2408560.00 | 2278576.00 | 53.30 | 0.00 | 0.00 | true | 0.811607;0.261681;0.807655;0.256330;0.806436;0.257270;0.809831;0.257195;0.811958;0.255538 | 0;0;0;0;0;0;0;0;0;0 | 4265152;294400;4262432;303616;4259040;283392;4254432;298752;4255456;300160 | 4616256;200352;4616256;200864;4616256;200736;4616256;200864;4616256;200864 | |
365 | densenet0_stage4_concat7 | Concat | [32,736,7,7] | 372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.00 | 0 | 2408560.00 | 2278576.00 | 53.30 | 0.00 | 0.00 | true | 0.811607;0.261681;0.807655;0.256330;0.806436;0.257270;0.809831;0.257195;0.811958;0.255538 | 0;0;0;0;0;0;0;0;0;0 | 4265152;294400;4262432;303616;4259040;283392;4254432;298752;4255456;300160 | 4616256;200352;4616256;200864;4616256;200736;4616256;200864;4616256;200864 | |
366 | densenet0_stage4_batchnorm16_fwd | BatchNorm | [32,768,7,7] | 657.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 45.00 | 8429568 | 4703605.33 | 5588544.00 | 19.90 | 0.82 | 187.32 | true | 0.195586;0.204289;0.203316;0.195492;0.197805 | 8429568;8429568;8429568;8429568;8429568 | 4703648;4703648;4703520;4705568;4703392 | 5582400;5595200;5603616;5573152;5588032 | |
367 | densenet0_stage4_relu16_fwd | Activation | [32,768,7,7] | 366.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.00 | 2408448 | 1333237.33 | 4442016.00 | 88.60 | 0.42 | 218.95 | true | 0.888834;0.885342;0.888627;0.884438;0.884339 | 2408448;2408448;2408448;2408448;2408448 | 4438816;4440864;4446368;4458944;4425312 | 1342624;1330464;1309728;1330720;1338528 | |
368 | densenet0_stage4_conv16_fwd | Convolution | [32,768,7,7] | 21167.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 77.33 | 308482048 | 396629.33 | 1207658.67 | 6.90 | 192.29 | 3989.01 | false | 0.069764;0.070032;0.068118;0.069695;0.068899 | 308482048;308482048;308482048;308482048;308482048 | 400512;396608;394400;397248;396032 | 1200864;1215968;1197088;1206144;1237216 | |
369 | densenet0_stage4_batchnorm17_fwd | BatchNorm | [32,128,7,7] | 176.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2101.33 | 797546.67 | 14.30 | 1.76 | 127.72 | true | 0.152724;0.168694;0.123825;0.152061;0.124694 | 1404928;1404928;1404928;1404928;1404928 | 2048;2048;2048;2208;2208 | 798208;796128;803264;798304;792768 | |
370 | densenet0_stage4_relu17_fwd | Activation | [32,128,7,7] | 108.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 20192.00 | 56.80 | 19.60 | 80.28 | false | 0.557894;0.561882;0.555215;0.583778;0.587422 | 401408;401408;401408;401408;401408 | 19392;20928;18816;22368;20256 | 288;288;288;288;288 | |
371 | densenet0_stage4_conv17_fwd | Convolution | [32,128,7,7] | 10999.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 257706.67 | 12.50 | 546.50 | 4023.91 | false | 0.124900;0.124900;0.124899;0.124898;0.124898 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 252288;258432;260032;255296;259392 | |
371 | densenet0_stage4_conv17_fwd | Convolution | [32,128,7,7] | 10999.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 148138.67 | 3018.67 | 6.20 | 1.57 | 47.51 | true | 0.062324;0.062339;0.062331;0.062344;0.062321 | 237568;237568;237568;237568;237568 | 151808;147456;147456;149504;147456 | 9312;1632;1664;5728;1664 | |
372 | densenet0_stage4_concat8 | Concat | [32,768,7,7] | 369.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.00 | 0 | 2508912.00 | 2368581.33 | 53.60 | 0.00 | 0.00 | true | 0.806238;0.264930;0.815143;0.262340;0.808898;0.262926;0.811280;0.257813;0.813061;0.263795 | 0;0;0;0;0;0;0;0;0;0 | 4823616;200864;4816960;200864;4816960;200864;4816960;200736;4816960;200864 | 4521728;226304;4482144;244864;4505152;243456;4480256;255616;4506592;239616 | |
372 | densenet0_stage4_concat8 | Concat | [32,768,7,7] | 369.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.29 | 0 | 2508912.00 | 2368581.33 | 53.60 | 0.00 | 0.00 | true | 0.806238;0.264930;0.815143;0.262340;0.808898;0.262926;0.811280;0.257813;0.813061;0.263795 | 0;0;0;0;0;0;0;0;0;0 | 4823616;200864;4816960;200864;4816960;200864;4816960;200736;4816960;200864 | 4521728;226304;4482144;244864;4505152;243456;4480256;255616;4506592;239616 | |
373 | densenet0_stage4_batchnorm18_fwd | BatchNorm | [32,800,7,7] | 677.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 46.33 | 8780800 | 4905088.00 | 5780096.00 | 19.30 | 0.82 | 189.52 | true | 0.222641;0.188103;0.195309;0.190223;0.194918 | 8780800;8780800;8780800;8780800;8780800 | 4904992;4905120;4905120;4905536;4905024 | 5781504;5770432;5790528;5788352;5762400 | |
374 | densenet0_stage4_relu18_fwd | Activation | [32,800,7,7] | 380.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 12.00 | 2508800 | 1620341.33 | 4697834.67 | 89.90 | 0.40 | 209.07 | true | 0.896877;0.901554;0.901994;0.897669;0.896826 | 2508800;2508800;2508800;2508800;2508800 | 1624544;1623392;1599840;1613088;1639648 | 4693888;4692736;4702336;4697280;4737792 | |
375 | densenet0_stage4_conv18_fwd | Convolution | [32,800,7,7] | 22452.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 80.67 | 321327104 | 414464.00 | 1225930.67 | 6.90 | 195.88 | 3983.38 | false | 0.069151;0.070363;0.069075;0.068498;0.069138 | 321327104;321327104;321327104;321327104;321327104 | 413120;416640;410624;414784;415488 | 1240864;1232512;1230528;1214752;1211328 | |
376 | densenet0_stage4_batchnorm19_fwd | BatchNorm | [32,128,7,7] | 178 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2154.67 | 793248.00 | 15.40 | 1.77 | 127.72 | true | 0.152696;0.151766;0.154280;0.157747;0.154503 | 1404928;1404928;1404928;1404928;1404928 | 2208;2208;2208;2048;2048 | 790080;787744;791808;797856;798464 | |
377 | densenet0_stage4_relu19_fwd | Activation | [32,128,7,7] | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 21013.33 | 55.90 | 18.84 | 80.28 | false | 0.557385;0.555779;0.560590;0.561298;0.558433 | 401408;401408;401408;401408;401408 | 288;288;288;288;288 | 21568;22432;20864;18912;20608 | |
378 | densenet0_stage4_conv19_fwd | Convolution | [32,128,7,7] | 11013.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.33 | 140836864 | 0.00 | 259029.33 | 12.50 | 543.71 | 4102.08 | false | 0.124900;0.124900;0.124900;0.124899;0.124899 | 140836864;140836864;140836864;140836864;140836864 | 259168;258912;259008;260032;246336 | 0;0;0;0;0 | |
378 | densenet0_stage4_conv19_fwd | Convolution | [32,128,7,7] | 11013.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 1525.33 | 6.20 | 1.59 | 54.83 | true | 0.062341;0.062329;0.062328;0.062341;0.062341 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;154112 | 1504;1472;1536;1536;15520 | |
379 | densenet0_stage4_concat9 | Concat | [32,800,7,7] | 404.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.57 | 0 | 2610117.33 | 2475109.33 | 53.60 | 0.00 | 0.00 | true | 0.811988;0.262489;0.818001;0.257204;0.812432;0.258572;0.815193;0.257763;0.813674;0.258110 | 0;0;0;0;0;0;0;0;0;0 | 5017664;200864;5017664;200864;5017664;200864;5017664;205984;5017664;200864 | 4676160;273888;4672096;286944;4661056;286176;4656288;292736;4667456;269312 | |
379 | densenet0_stage4_concat9 | Concat | [32,800,7,7] | 404.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.71 | 0 | 2610117.33 | 2475109.33 | 53.60 | 0.00 | 0.00 | true | 0.811988;0.262489;0.818001;0.257204;0.812432;0.258572;0.815193;0.257763;0.813674;0.258110 | 0;0;0;0;0;0;0;0;0;0 | 4676160;273888;4672096;286944;4661056;286176;4656288;292736;4667456;269312 | 5017664;200864;5017664;200864;5017664;200864;5017664;205984;5017664;200864 | |
380 | densenet0_stage4_batchnorm20_fwd | BatchNorm | [32,832,7,7] | 711 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 48.00 | 9132032 | 5115509.33 | 6065941.33 | 20.10 | 0.82 | 190.25 | true | 0.198932;0.204651;0.196168;0.199098;0.213346 | 9132032;9132032;9132032;9132032;9132032 | 5112608;5112608;5121312;5112608;5129248 | 6069504;6056448;6060512;6067808;6077056 | |
381 | densenet0_stage4_relu20_fwd | Activation | [32,832,7,7] | 481 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 13.33 | 2609152 | 1796725.33 | 4923861.33 | 89.10 | 0.39 | 195.69 | true | 0.890068;0.890042;0.894981;0.892235;0.890953 | 2609152;2609152;2609152;2609152;2609152 | 1804704;1795296;1804576;1780064;1790304 | 4931296;4928864;4937600;4911424;4906944 | |
382 | densenet0_stage4_conv20_fwd | Convolution | [32,832,7,7] | 23121 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 83.00 | 334172160 | 428042.67 | 1193888.00 | 7.00 | 206.03 | 4026.17 | false | 0.071196;0.070079;0.070556;0.070237;0.069556 | 334172160;334172160;334172160;334172160;334172160 | 427968;428544;431104;427392;427616 | 1188480;1194208;1187200;1211488;1198976 | |
383 | densenet0_stage4_batchnorm21_fwd | BatchNorm | [32,128,7,7] | 182 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2208.00 | 799701.33 | 15.30 | 1.75 | 127.72 | true | 0.150159;0.153752;0.153790;0.155629;0.152936 | 1404928;1404928;1404928;1404928;1404928 | 2464;2208;2208;2208;2208 | 803072;802752;797216;792736;799136 | |
384 | densenet0_stage4_relu21_fwd | Activation | [32,128,7,7] | 116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 21888.00 | 56.10 | 18.10 | 80.28 | false | 0.558039;0.564237;0.559812;0.564346;0.558652 | 401408;401408;401408;401408;401408 | 288;288;288;288;288 | 19872;21280;22912;21920;22464 | |
385 | densenet0_stage4_conv21_fwd | Convolution | [32,128,7,7] | 10984.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 254741.33 | 12.50 | 552.86 | 4023.91 | false | 0.124899;0.124898;0.124900;0.124899;0.124901 | 140836864;140836864;140836864;140836864;140836864 | 260224;234240;243104;261056;260896 | 0;3072;0;0;0 | |
385 | densenet0_stage4_conv21_fwd | Convolution | [32,128,7,7] | 10984.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147456.00 | 1621.33 | 6.20 | 1.59 | 50.90 | true | 0.062338;0.062331;0.062335;0.062345;0.062336 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 1664;1536;16864;1664;1536 | |
386 | densenet0_stage4_concat10 | Concat | [32,832,7,7] | 436 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.57 | 0 | 2709616.00 | 2575226.67 | 53.70 | 0.00 | 0.00 | true | 0.813804;0.257447;0.813251;0.259458;0.810671;0.258431;0.814221;0.258026;0.814002;0.264361 | 0;0;0;0;0;0;0;0;0;0 | 4857888;290304;4861344;290176;4873952;271616;4868064;279552;4867744;283904 | 5218368;200864;5218368;200864;5225024;200864;5218368;200864;5218368;200864 | |
386 | densenet0_stage4_concat10 | Concat | [32,832,7,7] | 436 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.71 | 0 | 2709616.00 | 2575226.67 | 53.70 | 0.00 | 0.00 | true | 0.813804;0.257447;0.813251;0.259458;0.810671;0.258431;0.814221;0.258026;0.814002;0.264361 | 0;0;0;0;0;0;0;0;0;0 | 4857888;290304;4861344;290176;4873952;271616;4868064;279552;4867744;283904 | 5218368;200864;5218368;200864;5225024;200864;5218368;200864;5218368;200864 | |
387 | densenet0_stage4_batchnorm22_fwd | BatchNorm | [32,864,7,7] | 727.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 49.67 | 9483264 | 5320693.33 | 6243840.00 | 19.80 | 0.82 | 190.94 | true | 0.200047;0.196916;0.201688;0.198038;0.197263 | 9483264;9483264;9483264;9483264;9483264 | 5321120;5320096;5320608;5320352;5321888 | 6236704;6252832;6241984;6276352;6223136 | |
388 | densenet0_stage4_relu22_fwd | Activation | [32,864,7,7] | 502.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 13.33 | 2709504 | 2112032.00 | 5189504.00 | 91.00 | 0.37 | 203.22 | true | 0.907577;0.909764;0.910191;0.911046;0.910005 | 2709504;2709504;2709504;2709504;2709504 | 5191648;5183648;5193216;5173152;5193536 | 2107296;2110240;2118560;2088352;2142048 | |
389 | densenet0_stage4_conv22_fwd | Convolution | [32,864,7,7] | 23678.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.67 | 347017216 | 451488.00 | 1214976.00 | 7.00 | 208.24 | 3958.36 | false | 0.071089;0.069462;0.070387;0.069582;0.070215 | 347017216;347017216;347017216;347017216;347017216 | 453056;450240;452608;446368;451616 | 1226752;1215616;1204448;1206784;1222528 | |
390 | densenet0_stage4_batchnorm23_fwd | BatchNorm | [32,128,7,7] | 180.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2208.00 | 772362.67 | 15.40 | 1.81 | 131.71 | true | 0.153127;0.154844;0.151929;0.165151;0.155509 | 1404928;1404928;1404928;1404928;1404928 | 768640;774656;773472;773568;770048 | 2208;2208;2208;2208;2208 | |
391 | densenet0_stage4_relu23_fwd | Activation | [32,128,7,7] | 122 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 401408 | 288.00 | 24309.33 | 56.30 | 16.32 | 92.64 | true | 0.559152;0.568390;0.577273;0.561004;0.557605 | 401408;401408;401408;401408;401408 | 288;288;288;2592;288 | 24480;22752;24928;24736;23712 | |
392 | densenet0_stage4_conv23_fwd | Convolution | [32,128,7,7] | 10994.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 0.00 | 260309.33 | 12.50 | 541.04 | 4062.56 | false | 0.124899;0.124898;0.124900;0.124899;0.124898 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 261024;260000;259904;247392;261632 | |
392 | densenet0_stage4_conv23_fwd | Convolution | [32,128,7,7] | 10994.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 147456.00 | 1578.67 | 6.20 | 1.59 | 59.39 | true | 0.062343;0.062334;0.062343;0.062346;0.062335 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 1536;1664;1536;15616;1536 | |
393 | densenet0_stage4_concat11 | Concat | [32,864,7,7] | 443.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.00 | 0 | 2809968.00 | 2677402.67 | 53.60 | 0.00 | 0.00 | true | 0.813569;0.254090;0.813728;0.255900;0.814110;0.261605;0.816501;0.253865;0.814299;0.252819 | 0;0;0;0;0;0;0;0;0;0 | 5419072;200864;5419072;200864;5419072;200864;5419072;200864;5423424;200864 | 5114624;239232;5114304;241792;5117248;241280;5117888;236640;5113184;229728 | |
393 | densenet0_stage4_concat11 | Concat | [32,864,7,7] | 443.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.00 | 0 | 2809968.00 | 2677402.67 | 53.60 | 0.00 | 0.00 | true | 0.813569;0.254090;0.813728;0.255900;0.814110;0.261605;0.816501;0.253865;0.814299;0.252819 | 0;0;0;0;0;0;0;0;0;0 | 5419072;200864;5419072;200864;5419072;200864;5419072;200864;5423424;200864 | 5114624;239232;5114304;241792;5117248;241280;5117888;236640;5113184;229728 | |
394 | densenet0_stage4_batchnorm24_fwd | BatchNorm | [32,896,7,7] | 752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 51.67 | 9834496 | 5532746.67 | 6456874.67 | 20.30 | 0.82 | 190.34 | true | 0.202370;0.201656;0.201433;0.203605;0.215936 | 9834496;9834496;9834496;9834496;9834496 | 5533216;5532320;5532064;5536928;5532704 | 6439968;6449824;6448064;6472736;6488000 | |
395 | densenet0_stage4_relu24_fwd | Activation | [32,896,7,7] | 518.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 14.67 | 2809856 | 2362336.00 | 5448128.00 | 91.60 | 0.36 | 191.58 | true | 0.916424;0.915660;0.917086;0.915679;0.914083 | 2809856;2809856;2809856;2809856;2809856 | 2373472;2354656;2369120;2363232;2349920 | 5461408;5456192;5453088;5435104;5431648 | |
396 | densenet0_stage4_conv24_fwd | Convolution | [32,896,7,7] | 24968 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 89.33 | 359862272 | 474602.67 | 1252725.33 | 7.00 | 208.33 | 4028.32 | false | 0.070542;0.070536;0.069754;0.069668;0.070377 | 359862272;359862272;359862272;359862272;359862272 | 482752;478464;467840;470848;474496 | 1256512;1242656;1250752;1255072;1252352 | |
397 | densenet0_stage4_batchnorm25_fwd | BatchNorm | [32,128,7,7] | 181.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2208.00 | 757109.33 | 15.20 | 1.85 | 131.71 | true | 0.150202;0.159192;0.150057;0.152534;0.152877 | 1404928;1404928;1404928;1404928;1404928 | 2208;2208;2208;2208;2464 | 759040;758304;757376;752448;755648 | |
398 | densenet0_stage4_relu25_fwd | Activation | [32,128,7,7] | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 21162.67 | 55.90 | 18.71 | 80.28 | false | 0.560512;0.554581;0.558256;0.558423;0.563487 | 401408;401408;401408;401408;401408 | 288;288;288;288;288 | 20896;22016;20576;22688;19808 | |
399 | densenet0_stage4_conv25_fwd | Convolution | [32,128,7,7] | 11015.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 0.00 | 252032.00 | 12.50 | 558.81 | 4062.56 | false | 0.124899;0.124898;0.124899;0.124900;0.124898 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 251808;242208;252224;252416;252064 | |
399 | densenet0_stage4_conv25_fwd | Convolution | [32,128,7,7] | 11015.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 147456.00 | 4672.00 | 6.20 | 1.56 | 59.39 | true | 0.062335;0.062335;0.062353;0.062340;0.062344 | 237568;237568;237568;237568;237568 | 147456;152832;147456;147456;147456 | 1920;12384;2656;2688;8672 | |
400 | densenet0_stage4_concat12 | Concat | [32,896,7,7] | 451 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.14 | 0 | 2911301.33 | 2780682.67 | 53.90 | 0.00 | 0.00 | true | 0.818328;0.258595;0.819405;0.259596;0.815948;0.258482;0.819055;0.259166;0.819886;0.259072 | 0;0;0;0;0;0;0;0;0;0 | 5252864;306144;5265728;302560;5266272;304352;5256480;294112;5261696;283776 | 5619776;200864;5619776;200864;5619776;200864;5619776;200864;5619776;206752 | |
400 | densenet0_stage4_concat12 | Concat | [32,896,7,7] | 451 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.14 | 0 | 2911301.33 | 2780682.67 | 53.90 | 0.00 | 0.00 | true | 0.818328;0.258595;0.819405;0.259596;0.815948;0.258482;0.819055;0.259166;0.819886;0.259072 | 0;0;0;0;0;0;0;0;0;0 | 5619776;200864;5619776;200864;5619776;200864;5619776;200864;5619776;206752 | 5252864;306144;5265728;302560;5266272;304352;5256480;294112;5261696;283776 | |
401 | densenet0_stage4_batchnorm26_fwd | BatchNorm | [32,928,7,7] | 781.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 53.00 | 10185728 | 5728800.00 | 6627168.00 | 20.60 | 0.82 | 192.18 | true | 0.209805;0.204033;0.203009;0.204555;0.209826 | 10185728;10185728;10185728;10185728;10185728 | 5728800;5729056;5728800;5728800;5728800 | 6625728;6673024;6633568;6622208;6606048 | |
402 | densenet0_stage4_relu26_fwd | Activation | [32,928,7,7] | 529 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 15.67 | 2910208 | 2781664.00 | 5692704.00 | 90.50 | 0.34 | 185.75 | true | 0.911466;0.906866;0.904192;0.903373;0.905373 | 2910208;2910208;2910208;2910208;2910208 | 2781536;2755168;2778976;2794848;2784480 | 5686880;5658304;5694880;5696352;5706848 | |
403 | densenet0_stage4_conv26_fwd | Convolution | [32,928,7,7] | 25728.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 93.00 | 372707328 | 930165.33 | 1396906.67 | 7.00 | 160.16 | 4007.61 | false | 0.069712;0.069659;0.070550;0.069841;0.069699 | 372707328;372707328;372707328;372707328;372707328 | 930656;927360;929632;930208;931424 | 1403776;1381984;1399360;1395392;1395968 | |
404 | densenet0_stage4_batchnorm27_fwd | BatchNorm | [32,128,7,7] | 186.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2208.00 | 630304.00 | 15.40 | 2.22 | 127.72 | true | 0.152653;0.155127;0.153307;0.151143;0.160756 | 1404928;1404928;1404928;1404928;1404928 | 2208;2208;2208;2208;2208 | 635392;632160;631200;624320;627552 | |
405 | densenet0_stage4_relu27_fwd | Activation | [32,128,7,7] | 121.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 18816.00 | 56.20 | 21.01 | 80.28 | false | 0.561990;0.562383;0.563109;0.564840;0.555649 | 401408;401408;401408;401408;401408 | 17632;18912;19968;19104;18432 | 288;288;288;288;288 | |
406 | densenet0_stage4_conv27_fwd | Convolution | [32,128,7,7] | 11015.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 237354.67 | 12.50 | 593.36 | 4023.91 | false | 0.124898;0.124900;0.124897;0.124900;0.124899 | 140836864;140836864;140836864;140836864;140836864 | 234464;231872;235584;242016;245792 | 0;0;0;0;0 | |
406 | densenet0_stage4_conv27_fwd | Convolution | [32,128,7,7] | 11015.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 1824.00 | 6.20 | 1.59 | 54.83 | true | 0.062336;0.062335;0.062341;0.062341;0.062332 | 237568;237568;237568;237568;237568 | 147456;149504;147456;147456;147456 | 1536;7904;2400;1408;1536 | |
407 | densenet0_stage4_concat13 | Concat | [32,928,7,7] | 477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.57 | 0 | 3010672.00 | 2871845.33 | 54.10 | 0.00 | 0.00 | true | 0.822405;0.256645;0.822632;0.258830;0.825482;0.258422;0.816246;0.264633;0.822615;0.259222 | 0;0;0;0;0;0;0;0;0;0 | 5439904;298976;5458240;293600;5446528;296544;5455520;286976;5456672;290944 | 5820480;200864;5820480;200864;5820480;200864;5820480;200864;5820480;200864 | |
407 | densenet0_stage4_concat13 | Concat | [32,928,7,7] | 477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.43 | 0 | 3010672.00 | 2871845.33 | 54.10 | 0.00 | 0.00 | true | 0.822405;0.256645;0.822632;0.258830;0.825482;0.258422;0.816246;0.264633;0.822615;0.259222 | 0;0;0;0;0;0;0;0;0;0 | 5820480;200864;5820480;200864;5820480;200864;5820480;200864;5820480;200864 | 5439904;298976;5458240;293600;5446528;296544;5455520;286976;5456672;290944 | |
408 | densenet0_stage4_batchnorm28_fwd | BatchNorm | [32,960,7,7] | 812 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 54.67 | 10536960 | 5936160.00 | 6821024.00 | 20.50 | 0.83 | 192.75 | true | 0.207078;0.195370;0.210030;0.202333;0.204133 | 10536960;10536960;10536960;10536960;10536960 | 6818496;6820416;6820704;6821952;6837920 | 5936160;5936160;5936160;5936032;5936160 | |
409 | densenet0_stage4_relu28_fwd | Activation | [32,960,7,7] | 547 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 16.00 | 3010560 | 3083786.67 | 5962080.00 | 91.40 | 0.33 | 188.16 | true | 0.915330;0.915185;0.917090;0.908124;0.911384 | 3010560;3010560;3010560;3010560;3010560 | 3084384;3085664;3081312;3077344;3087968 | 5960864;5960352;5965248;5965024;5956064 | |
410 | densenet0_stage4_conv28_fwd | Convolution | [32,960,7,7] | 26448 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 95.00 | 385552384 | 929792.00 | 1414496.00 | 7.10 | 164.46 | 4058.45 | false | 0.069942;0.071069;0.071272;0.070278;0.071051 | 385552384;385552384;385552384;385552384;385552384 | 928832;927648;924704;932896;938304 | 1403872;1427072;1412544;1432512;1401376 | |
411 | densenet0_stage4_batchnorm29_fwd | BatchNorm | [32,128,7,7] | 185.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 1404928 | 2208.00 | 609109.33 | 15.50 | 2.30 | 127.72 | true | 0.155949;0.154902;0.155868;0.153486;0.155663 | 1404928;1404928;1404928;1404928;1404928 | 2208;2208;2208;2464;2208 | 622400;612736;613472;599200;601120 | |
412 | densenet0_stage4_relu29_fwd | Activation | [32,128,7,7] | 120.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 288.00 | 16341.33 | 56.00 | 24.14 | 86.01 | false | 0.567161;0.557484;0.563118;0.556445;0.559644 | 401408;401408;401408;401408;401408 | 288;288;288;5408;288 | 17664;15744;16960;15968;16096 | |
413 | densenet0_stage4_conv29_fwd | Convolution | [32,128,7,7] | 11031.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 0.00 | 230357.33 | 12.50 | 611.38 | 4062.56 | false | 0.124899;0.124899;0.124900;0.124898;0.124901 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 220064;225216;233312;232544;251328 | |
413 | densenet0_stage4_conv29_fwd | Convolution | [32,128,7,7] | 11031.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147456.00 | 16085.33 | 6.20 | 1.45 | 54.83 | true | 0.062332;0.062332;0.062329;0.062344;0.062331 | 237568;237568;237568;237568;237568 | 147456;147456;147712;147456;147456 | 23936;12800;15232;20224;9728 | |
414 | densenet0_stage4_concat14 | Concat | [32,960,7,7] | 482.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.43 | 0 | 3111194.67 | 2960133.33 | 53.80 | 0.00 | 0.00 | true | 0.818072;0.258595;0.822666;0.253491;0.819301;0.257216;0.821349;0.256759;0.818756;0.256807 | 0;0;0;0;0;0;0;0;0;0 | 6021184;200864;6021184;200864;6022720;200864;6021312;201888;6021184;200864 | 5646944;283872;5634464;282080;5637216;283488;5632384;275168;5639872;289376 | |
414 | densenet0_stage4_concat14 | Concat | [32,960,7,7] | 482.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.14 | 0 | 3111194.67 | 2960133.33 | 53.80 | 0.00 | 0.00 | true | 0.818072;0.258595;0.822666;0.253491;0.819301;0.257216;0.821349;0.256759;0.818756;0.256807 | 0;0;0;0;0;0;0;0;0;0 | 6021184;200864;6021184;200864;6022720;200864;6021312;201888;6021184;200864 | 5646944;283872;5634464;282080;5637216;283488;5632384;275168;5639872;289376 | |
415 | densenet0_stage4_batchnorm30_fwd | BatchNorm | [32,992,7,7] | 825.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 56.00 | 10888192 | 6137504.00 | 7002997.33 | 20.60 | 0.83 | 194.43 | true | 0.204394;0.204389;0.208445;0.207936;0.201000 | 10888192;10888192;10888192;10888192;10888192 | 6137376;6137504;6137056;6137632;6137888 | 6980480;6999840;7005248;7003904;7023808 | |
416 | densenet0_stage4_relu30_fwd | Activation | [32,992,7,7] | 571 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 17.00 | 3110912 | 3611146.67 | 6233002.67 | 91.80 | 0.32 | 182.99 | true | 0.916870;0.921767;0.915001;0.915620;0.920385 | 3110912;3110912;3110912;3110912;3110912 | 3605856;3618144;3605344;3616992;3610592 | 6248256;6234400;6237312;6227296;6216128 | |
417 | densenet0_stage4_conv30_fwd | Convolution | [32,992,7,7] | 27702.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 98.00 | 398397440 | 1484149.33 | 1535445.33 | 7.10 | 131.94 | 4065.28 | false | 0.071499;0.071389;0.071752;0.071244;0.070688 | 398397440;398397440;398397440;398397440;398397440 | 1489920;1472832;1478368;1491584;1484160 | 1505696;1522560;1543776;1540000;1555616 | |
418 | densenet0_stage4_batchnorm31_fwd | BatchNorm | [32,128,7,7] | 187.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 1404928 | 2208.00 | 530645.33 | 15.30 | 2.64 | 131.71 | true | 0.151863;0.155439;0.160111;0.150314;0.152188 | 1404928;1404928;1404928;1404928;1404928 | 2208;2208;2208;2208;2208 | 557888;535008;527744;529184;525824 | |
419 | densenet0_stage4_relu31_fwd | Activation | [32,128,7,7] | 123 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 288.00 | 12096.00 | 56.20 | 32.41 | 80.28 | false | 0.561826;0.566138;0.554663;0.563464;0.560009 | 401408;401408;401408;401408;401408 | 288;288;288;288;288 | 12160;12256;11872;12928;11232 | |
420 | densenet0_stage4_conv31_fwd | Convolution | [32,128,7,7] | 10977 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 140836864 | 0.00 | 224949.33 | 12.50 | 626.08 | 4062.56 | false | 0.124898;0.124900;0.124900;0.124899;0.124900 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 231360;228608;222528;223712;221920 | |
420 | densenet0_stage4_conv31_fwd | Convolution | [32,128,7,7] | 10977 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147456.00 | 9674.67 | 6.20 | 1.51 | 47.51 | true | 0.062331;0.062341;0.062329;0.062331;0.062342 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 6144;8672;11392;8960;11648 | |
421 | densenet0_stage4_concat15 | Concat | [32,992,7,7] | 578.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.14 | 0 | 3211376.00 | 3043898.67 | 54.20 | 0.00 | 0.00 | true | 0.821556;0.264310;0.822358;0.257217;0.822357;0.261078;0.822649;0.258753;0.821430;0.261457 | 0;0;0;0;0;0;0;0;0;0 | 6221888;200864;6221888;200864;6221888;200864;6227008;200864;6223936;200864 | 5813824;275072;5815008;282240;5805696;280704;5827040;268416;5805856;272384 | |
421 | densenet0_stage4_concat15 | Concat | [32,992,7,7] | 578.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.86 | 0 | 3211376.00 | 3043898.67 | 54.20 | 0.00 | 0.00 | true | 0.821556;0.264310;0.822358;0.257217;0.822357;0.261078;0.822649;0.258753;0.821430;0.261457 | 0;0;0;0;0;0;0;0;0;0 | 6221888;200864;6221888;200864;6221888;200864;6227008;200864;6223936;200864 | 5813824;275072;5815008;282240;5805696;280704;5827040;268416;5805856;272384 | |
422 | densenet0_batchnorm4_fwd | BatchNorm | [32,1024,7,7] | 878 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 57.67 | 11239424 | 6344778.67 | 7251114.67 | 19.90 | 0.83 | 194.90 | true | 0.198380;0.198874;0.201886;0.198954;0.196999 | 11239424;11239424;11239424;11239424;11239424 | 6344800;6344608;6344928;6344672;6344864 | 7258912;7243584;7243360;7250848;7265504 | |
423 | densenet0_relu4_fwd | Activation | [32,1024,7,7] | 583.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 17.67 | 3211264 | 4028426.67 | 6426858.67 | 91.60 | 0.31 | 181.77 | true | 0.915121;0.916783;0.912965;0.915039;0.917758 | 3211264;3211264;3211264;3211264;3211264 | 4024288;4028384;4037728;4026592;4030304 | 6425536;6432864;6427328;6427712;6415488 | |
424 | densenet0_pool4_fwd | Pooling | [32,1024,7,7] | 2118 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 2296288 | 782805.33 | 1657034.67 | 32.10 | 0.94 | 191.36 | true | 0.319902;0.321445;0.320788;0.320804;0.320909 | 2296288;2296288;2296288;2296288;2296288 | 780800;786112;780160;783360;784256 | 1648192;1673120;1654976;1660544;1655584 | |
426 | densenet0_dense0_fwd | FullyConnected | [32,1024] | 4880.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_32x32_sliced1x4_tn | 28.67 | 67633152 | 4111626.67 | 929109.33 | 6.20 | 13.42 | 2359.27 | true | 0.061996;0.062002;0.061937;0.061942;0.061950 | 67633152;67633152;67633152;67633152;67633152 | 4110688;4116576;4110688;4110688;4113504 | 933952;907168;934304;919072;948064 | |
426 | densenet0_dense0_fwd | FullyConnected | [32,1024] | 4880.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 4.33 | 32000 | 6442.67 | 4309.33 | 19.00 | 2.98 | 7.39 | true | 0.188410;0.189389;0.190858;0.190093;0.190869 | 32000;32000;32000;32000;32000 | 5760;19072;7808;5760;5760 | 3712;13824;4992;3840;4096 |
Showing 1 to 612 of 612 entries