GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/mul-0-TransposeNHWCToNCHW-LayoutOptimizer | Transpose | [[128 3 224 224]] | 359.667 | 77070336 | 77070336 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 275.33 | 0 | 77077600.00 | 79710400.00 | 93.30 | 0.00 | 0.00 | true | 0.933051;0.933138;0.933062;0.933137;0.932935 | 0;0;0;0;0 | 77077600;77077600;77077600;77077600;77077600 | 79703296;79723168;79704736;79672800;79763136 |
2 | InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/mul | Conv2D | [[128 64 112 112]] | 3116.667 | 411041792 | 488914944 | 514591232 | GPU_0_bfc | 77873152 | 0 | 0 | 0 | volta_scudnn_128x64_relu_medium_nn_v1 | 2582.67 | 31444697088 | 11255360.00 | 262971498.67 | 24.90 | 114.67 | 12175.28 | false | 0.248949;0.248949;0.248950;0.248960;0.248969 | 31444697088;31444697088;31444697088;31444697088;31444697088 | 11303488;11501184;11202624;11259968;11140288 | 263864416;263843232;262057568;263013696;260069632 |
2 | InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/mul | Conv2D | [[128 64 112 112]] | 3116.667 | 411041792 | 488914944 | 514591232 | GPU_0_bfc | 77873152 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 322.00 | 0 | 79864416.00 | 78048373.33 | 47.60 | 0.00 | 0.00 | true | 0.476530;0.476187;0.476435;0.476002;0.476560 | 0;0;0;0;0 | 79778528;79801504;79844576;80088480;79947168 | 78053664;78045472;78047392;78052256;78036640 |
2 | InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/mul | Conv2D | [[128 64 112 112]] | 3116.667 | 411041792 | 488914944 | 514591232 | GPU_0_bfc | 77873152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 37824.00 | 32554.67 | 42.40 | 0.00 | 0.00 | true | 0.423832;0.424939;0.423472;0.424379;0.424609 | 0;0;0;0;0 | 37824;37824;37824;37824;37824 | 31744;32640;32512;34048;32512 |
2 | InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/mul | Conv2D | [[128 64 112 112]] | 3116.667 | 411041792 | 488914944 | 514591232 | GPU_0_bfc | 77873152 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 2400.00 | 47573.33 | 7.50 | 0.00 | 0.00 | true | 0.075536;0.075588;0.075240;0.075317;0.075342 | 0;0;0;0;0 | 2400;2400;2400;2400;2400 | 46720;47616;47360;53632;47744 |
3 | InceptionV1/InceptionV1/Conv2d_1a_7x7/BatchNorm/batchnorm/add_1 | Add | [[128 64 112 112]] | 1057.667 | 411041792 | 0 | 437520896 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 1022.00 | 102760448 | 59228170.67 | 119689248.00 | 49.90 | 0.57 | 100.55 | true | 0.498841;0.498757;0.498841;0.498868;0.498812 | 102760448;102760448;102760448;102760448;102760448 | 59849536;59308128;59334880;55044416;59041504 | 120901440;119877024;119871616;111255552;119319104 |
4 | InceptionV1/InceptionV1/Conv2d_1a_7x7/Relu | Relu | [[128 64 112 112]] | 1051.333 | 411041792 | 0 | 437520896 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 1026.33 | 0 | 59285344.00 | 120621664.00 | 98.20 | 0.00 | 0.00 | true | 0.981635;0.981725;0.982730;0.982037;0.981324 | 0;0;0;0;0 | 119356800;121615360;114135264;123056960;120892832 | 58642368;59783328;56032032;60457824;59430336 |
5 | InceptionV1/InceptionV1/MaxPool_2a_3x3/MaxPool | MaxPool | [[128 64 56 56]] | 882.667 | 154140672 | 154140672 | 591661568 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 836.00 | 25690112 | 27964928.00 | 16866922.67 | 72.20 | 0.57 | 30.73 | true | 0.722567;0.722105;0.722207;0.722229;0.722850 | 25690112;25690112;25690112;25690112;25690112 | 27261920;28149376;29021664;28125056;27620352 | 16530720;16757312;17667616;16950304;16893152 |
6 | InceptionV1/InceptionV1/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 56 56]] | 411 | 102760448 | 102795776 | 283380224 | GPU_0_bfc | 35328 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 303.00 | 3339714560 | 102791210.67 | 100876864.00 | 24.20 | 16.40 | 11022.16 | true | 0.242278;0.242357;0.242311;0.242283;0.242278 | 3339714560;3339714560;3339714560;3339714560;3339714560 | 102792064;102782464;102793600;102787968;102794112 | 100842208;101091520;100903968;100884416;100829408 |
6 | InceptionV1/InceptionV1/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 56 56]] | 411 | 102760448 | 102795776 | 283380224 | GPU_0_bfc | 35328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 16576.00 | 3520.00 | 41.60 | 0.00 | 0.00 | true | 0.416324;0.415871;0.414717;0.419369;0.416188 | 0;0;0;0;0 | 16832;16576;16576;16576;16576 | 1664;7296;3488;2944;4128 |
6 | InceptionV1/InceptionV1/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 56 56]] | 411 | 102760448 | 102795776 | 283380224 | GPU_0_bfc | 35328 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 1696.00 | 6.10 | 0.00 | 0.00 | true | 0.061155;0.061141;0.061105;0.061158;0.061145 | 0;0;0;0;0 | 864;1344;2880;3296;512 | 352;352;352;352;352 |
7 | InceptionV1/InceptionV1/Conv2d_2b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 56 56]] | 282 | 102760448 | 0 | 129239552 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 256.67 | 25690112 | 102762282.67 | 102813600.00 | 49.50 | 0.12 | 100.09 | true | 0.495053;0.495053;0.495278;0.495226;0.494804 | 25690112;25690112;25690112;25690112;25690112 | 102762496;102762560;102762496;102761856;102761472 | 102800672;102812960;102810144;102825760;102817696 |
8 | InceptionV1/InceptionV1/Conv2d_2b_1x1/Relu | Relu | [[128 64 56 56]] | 276.333 | 102760448 | 0 | 129239552 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 256.67 | 0 | 102760608.00 | 102753952.00 | 97.30 | 0.00 | 0.00 | true | 0.973708;0.972215;0.972577;0.972796;0.972692 | 0;0;0;0;0 | 102760608;102760608;102760608;102760608;102760608 | 102751488;102748416;102756096;102755328;102755040 |
9 | InceptionV1/InceptionV1/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 56 56]] | 4490.333 | 308281344 | 309953280 | 437520896 | GPU_0_bfc | 1671936 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 4359.67 | 47783608320 | 27874069.33 | 146476192.00 | 15.80 | 274.07 | 10960.38 | false | 0.157419;0.164686;0.157426;0.158276;0.157383 | 47783608320;47783608320;47783608320;47783608320;47783608320 | 146269312;145933184;149828896;145638112;147226080 | 27846624;27396832;27878848;27896736;28043840 |
9 | InceptionV1/InceptionV1/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 56 56]] | 4490.333 | 308281344 | 309953280 | 437520896 | GPU_0_bfc | 1671936 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 442560.00 | 571509.33 | 46.20 | 0.00 | 0.00 | true | 0.461046;0.461300;0.462069;0.462506;0.461383 | 0;0;0;0;0 | 444864;442560;442560;442560;442560 | 579744;572192;569888;572448;567840 |
9 | InceptionV1/InceptionV1/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 56 56]] | 4490.333 | 308281344 | 309953280 | 437520896 | GPU_0_bfc | 1671936 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 712704 | 3520.00 | 667498.67 | 7.50 | 1.06 | 118.78 | true | 0.074875;0.074633;0.074891;0.074664;0.074635 | 712704;712704;712704;712704;712704 | 3520;3520;3520;3520;3520 | 666688;673856;658624;667584;668224 |
10 | InceptionV1/InceptionV1/Conv2d_2c_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 192 56 56]] | 797.333 | 308281344 | 0 | 334760448 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 768.67 | 77070336 | 88009589.33 | 105626048.00 | 49.90 | 0.40 | 100.26 | true | 0.499110;0.499157;0.499151;0.499112;0.499131 | 77070336;77070336;77070336;77070336;77070336 | 85898048;88722656;96278528;87916480;87389632 | 105477920;105943424;151491264;99519776;105456800 |
11 | InceptionV1/InceptionV1/Conv2d_2c_3x3/Relu | Relu | [[128 192 56 56]] | 795.333 | 308281344 | 0 | 334760448 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 773.00 | 0 | 6543872.00 | 14562848.00 | 98.00 | 0.00 | 0.00 | true | 0.980210;0.980320;0.980259;0.979964;0.980604 | 0;0;0;0;0 | 8464768;1664;4292352;9120704;6874496 | 18490272;8384;9917760;19810976;15280512 |
12 | InceptionV1/InceptionV1/MaxPool_3a_3x3/MaxPool | MaxPool | [[128 192 28 28]] | 671.333 | 77070336 | 77070336 | 411830784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 624.33 | 19267584 | 128.00 | 1280.00 | 71.90 | 13684.36 | 30.86 | false | 0.718644;0.718958;0.719201;0.718479;0.719074 | 19267584;19267584;19267584;19267584;19267584 | 1280;1280;1280;1280;1280 | 128;128;128;128;128 |
13 | InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 192 28 28]] | 488.333 | 77070336 | 77070336 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 455.67 | 19267584 | 58605461.33 | 71101194.67 | 70.70 | 0.15 | 42.28 | true | 0.707477;0.707405;0.707409;0.707364;0.707353 | 19267584;19267584;19267584;19267584;19267584 | 55394880;56599104;62618752;62618688;56598592 | 67010272;70149024;75976256;75971968;67182592 |
14 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 16 28 28]] | 263.667 | 6422528 | 6439680 | 187042304 | GPU_0_bfc | 17152 | 0 | 0 | 0 | volta_scudnn_128x32_relu_interior_nn_v1 | 157.33 | 1239547904 | 77113664.00 | 7779082.67 | 20.10 | 14.60 | 7878.50 | true | 0.200930;0.199819;0.201910;0.200705;0.201691 | 1239547904;1239547904;1239547904;1239547904;1239547904 | 77113664;77113536;77113664;77113664;77113920 | 7781184;7772352;7782720;7783616;7773344 |
14 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 16 28 28]] | 263.667 | 6422528 | 6439680 | 187042304 | GPU_0_bfc | 17152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 12480.00 | 5472.00 | 44.70 | 0.00 | 0.00 | true | 0.446265;0.447348;0.446684;0.447505;0.446182 | 0;0;0;0;0 | 12480;12480;12480;12480;12480 | 4640;6688;5696;4928;5792 |
14 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 16 28 28]] | 263.667 | 6422528 | 6439680 | 187042304 | GPU_0_bfc | 17152 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 1482.67 | 5.90 | 0.00 | 0.00 | true | 0.059318;0.059333;0.059912;0.059343;0.059305 | 0;0;0;0;0 | 96;96;96;96;96 | 1888;1152;1856;1376;1216 |
15 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 512 | 38535168 | 38613760 | 225577472 | GPU_0_bfc | 78592 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 406.33 | 4958191616 | 149408320.00 | 36337802.67 | 23.80 | 26.69 | 12202.29 | false | 0.238037;0.237901;0.238405;0.239018;0.237936 | 4958191616;4958191616;4958191616;4958191616;4958191616 | 154221632;149397632;149394432;149432896;137376960 | 37499136;36231296;36434304;36347808;32930240 |
15 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 512 | 38535168 | 38613760 | 225577472 | GPU_0_bfc | 78592 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 74176.00 | 0.00 | 42.70 | 0.00 | 0.00 | true | 0.426309;0.425855;0.428361;0.431085;0.426023 | 0;0;0;0;0 | 74176;74176;74176;74176;74176 | 0;0;0;0;0 |
15 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 512 | 38535168 | 38613760 | 225577472 | GPU_0_bfc | 78592 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 352.00 | 0.00 | 5.90 | 0.00 | 0.00 | true | 0.059349;0.059335;0.059356;0.059351;0.059337 | 0;0;0;0;0 | 352;352;352;352;352 | 0;0;0;0;0 |
16 | InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 28 28]] | 323.667 | 25690112 | 25744128 | 251267584 | GPU_0_bfc | 54016 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 228.00 | 2479095808 | 77120085.33 | 25101354.67 | 22.80 | 24.25 | 10873.23 | false | 0.228881;0.230648;0.228287;0.228020;0.227158 | 2479095808;2479095808;2479095808;2479095808;2479095808 | 77118080;77103104;77103104;77139072;77163008 | 25142848;25205088;25052960;25108256;25023488 |
16 | InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 28 28]] | 323.667 | 25690112 | 25744128 | 251267584 | GPU_0_bfc | 54016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 49344.00 | 34304.00 | 41.90 | 0.00 | 0.00 | true | 0.419245;0.417950;0.420869;0.417969;0.422377 | 0;0;0;0;0 | 49344;49344;49344;49344;49344 | 36992;34368;32960;35584;22976 |
16 | InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 28 28]] | 323.667 | 25690112 | 25744128 | 251267584 | GPU_0_bfc | 54016 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 4202.67 | 5.90 | 0.00 | 0.00 | true | 0.059470;0.059299;0.059299;0.059292;0.059283 | 0;0;0;0;0 | 96;96;96;96;96 | 3904;3968;4800;4736;3904 |
17 | InceptionV1/InceptionV1/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 262 | 12845056 | 12874496 | 187042304 | GPU_0_bfc | 29440 | 0 | 0 | 0 | volta_scudnn_128x32_relu_interior_nn_v1 | 154.67 | 1239547904 | 77114773.33 | 13357301.33 | 19.70 | 13.70 | 8014.30 | true | 0.195967;0.197548;0.194744;0.198790;0.200285 | 1239547904;1239547904;1239547904;1239547904;1239547904 | 77114816;77114560;77116992;77114816;77114688 | 13359008;13401504;13341024;13362464;13350432 |
17 | InceptionV1/InceptionV1/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 262 | 12845056 | 12874496 | 187042304 | GPU_0_bfc | 29440 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 24768.00 | 17941.33 | 41.90 | 0.00 | 0.00 | true | 0.417756;0.419643;0.418091;0.419248;0.419371 | 0;0;0;0;0 | 24768;24768;24768;24768;24768 | 17472;17408;18112;18240;20480 |
17 | InceptionV1/InceptionV1/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 262 | 12845056 | 12874496 | 187042304 | GPU_0_bfc | 29440 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2154.67 | 5.90 | 0.00 | 0.00 | true | 0.059298;0.059303;0.059301;0.059308;0.059293 | 0;0;0;0;0 | 96;96;96;1888;96 | 2048;2368;1984;3968;2048 |
18 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 16 28 28]] | 45 | 6422528 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 19.00 | 1605632 | 6423189.33 | 5627210.67 | 44.20 | 0.13 | 84.51 | true | 0.438763;0.444387;0.442350;0.445552;0.440361 | 1605632;1605632;1605632;1605632;1605632 | 6423104;6430016;6423360;6423104;6423104 | 5620896;5638304;5626656;5620768;5634080 |
19 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 96 28 28]] | 117.333 | 38535168 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 100.33 | 9633792 | 38535786.67 | 38508565.33 | 48.90 | 0.13 | 96.02 | true | 0.488434;0.489525;0.488823;0.488532;0.488551 | 9633792;9633792;9633792;9633792;9633792 | 38535744;38536896;38535744;38535872;38535744 | 38519712;38498208;38506016;38543904;38499968 |
20 | InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 28 28]] | 85.667 | 25690112 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 68.67 | 6422528 | 25690624.00 | 25677973.33 | 48.10 | 0.13 | 93.53 | true | 0.480605;0.482069;0.482266;0.481272;0.480548 | 6422528;6422528;6422528;6422528;6422528 | 25699840;25690624;25690624;25690624;25690624 | 25670784;25696032;25675296;25659936;25687840 |
21 | InceptionV1/InceptionV1/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 32 28 28]] | 55 | 12845056 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 35.00 | 3211264 | 12845440.00 | 12835520.00 | 46.80 | 0.13 | 91.75 | true | 0.468039;0.466689;0.467883;0.467619;0.467221 | 3211264;3211264;3211264;3211264;3211264 | 12845440;12845440;12845440;12848000;12845440 | 12836512;12799392;12839040;12831008;12842784 |
22 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 16 28 28]] | 37 | 6422528 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 19.00 | 0 | 6422944.00 | 6269610.67 | 85.20 | 0.00 | 0.00 | true | 0.854735;0.853689;0.847187;0.856303;0.847316 | 0;0;0;0;0 | 6422944;6422944;6428320;6422944;6422944 | 6261760;6283520;6289248;6252416;6263552 |
23 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 96 28 28]] | 115.333 | 38535168 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 97.67 | 0 | 38535328.00 | 38664256.00 | 95.40 | 0.00 | 0.00 | true | 0.954136;0.955943;0.949438;0.951388;0.956999 | 0;0;0;0;0 | 38535328;38535328;38535328;38535328;38535328 | 38669152;38663520;38660096;38699264;38652000 |
24 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 214 | 12845056 | 12914944 | 122817024 | GPU_0_bfc | 69888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 108.33 | 601882624 | 21306922.67 | 18411328.00 | 23.90 | 15.15 | 5555.86 | true | 0.238357;0.238295;0.240457;0.238877;0.239368 | 601882624;601882624;601882624;601882624;601882624 | 21282912;21360064;21359872;21277984;21262144 | 18408128;18423904;18409792;18408704;18415488 |
24 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 214 | 12845056 | 12914944 | 122817024 | GPU_0_bfc | 69888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 18624.00 | 39808.00 | 40.20 | 0.00 | 0.00 | true | 0.399570;0.404108;0.406693;0.397506;0.401210 | 0;0;0;0;0 | 18624;18624;18624;18624;18624 | 39296;39936;38656;40192;40320 |
24 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 214 | 12845056 | 12914944 | 122817024 | GPU_0_bfc | 69888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 29696 | 192.00 | 54442.67 | 6.20 | 0.54 | 7.42 | true | 0.062321;0.062321;0.062332;0.062339;0.062325 | 29696;29696;29696;29696;29696 | 2496;192;192;192;192 | 62848;54400;54144;52992;54784 |
25 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 1051.667 | 51380224 | 230105088 | 167774720 | GPU_0_bfc | 178724864 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 467.33 | 5292032000 | 72449098.67 | 48545770.67 | 16.90 | 43.74 | 11323.90 | false | 0.166634;0.170023;0.171315;0.166984;0.168849 | 5292032000;5292032000;5292032000;5292032000;5292032000 | 71900832;71889952;71885728;73556512;76902176 | 48225344;48130496;48132288;49279680;51419072 |
25 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 1051.667 | 51380224 | 230105088 | 167774720 | GPU_0_bfc | 178724864 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 183.67 | 352321536 | 71276416.00 | 55638090.67 | 47.30 | 2.78 | 1918.26 | true | 0.474309;0.473577;0.473367;0.473433;0.472425 | 352321536;352321536;352321536;352321536;352321536 | 71278176;71276256;71274816;71273344;71297504 | 55672288;55584224;55626816;55615168;55686848 |
25 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 1051.667 | 51380224 | 230105088 | 167774720 | GPU_0_bfc | 178724864 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 154.67 | 277413888 | 38913568.00 | 53628000.00 | 46.90 | 3.00 | 1793.62 | true | 0.468945;0.470705;0.469291;0.468695;0.469020 | 277413888;277413888;277413888;277413888;277413888 | 38896736;38955680;38903328;38934048;38903328 | 53569280;53630272;53640640;53613088;53767840 |
25 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 1051.667 | 51380224 | 230105088 | 167774720 | GPU_0_bfc | 178724864 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 99.33 | 277413888 | 633109.33 | 51246965.33 | 46.30 | 5.35 | 2792.77 | true | 0.463580;0.460006;0.462877;0.462730;0.462331 | 277413888;277413888;277413888;277413888;277413888 | 51200064;51281408;51253632;51239872;51247392 | 628896;633568;633280;634208;632480 |
25 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 1051.667 | 51380224 | 230105088 | 167774720 | GPU_0_bfc | 178724864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 552576.00 | 290346.67 | 45.00 | 0.00 | 0.00 | true | 0.450012;0.448689;0.451145;0.449578;0.453292 | 0;0;0;0;0 | 552896;553824;550848;551008;554304 | 291680;289184;287808;290176;301984 |
26 | InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 32 28 28]] | 63.667 | 12845056 | 0 | 129239552 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 37.33 | 3211264 | 12848512.00 | 12915520.00 | 47.00 | 0.12 | 86.02 | true | 0.469528;0.470776;0.469469;0.470176;0.466763 | 3211264;3211264;3211264;3211264;3211264 | 12923584;12910912;12911360;12911616;12930880 | 12848512;12848512;12848512;12848768;12848512 |
27 | InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 128 28 28]] | 150 | 51380224 | 0 | 129239552 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 131.00 | 12845056 | 51380992.00 | 51426122.67 | 49.10 | 0.12 | 98.05 | true | 0.490872;0.490455;0.490333;0.490438;0.490655 | 12845056;12845056;12845056;12845056;12845056 | 51380992;51380992;51380992;51380992;51380992 | 51415328;51425312;51437728;51438080;51402912 |
28 | InceptionV1/InceptionV1/Mixed_3b/concat | ConcatV2 | [[128 256 28 28]] | 339 | 122028032 | 122028032 | 251267584 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 67.55 | 0 | 21409333.33 | 21460242.67 | 95.90 | 0.00 | 0.00 | true | 0.968322;0.976026;0.949135;0.947274;0.965142;0.977053;0.947591;0.946868;0.961113;0.976458;0.956867;0.951522;0.966137;0.975854;0.948398;0.947394;0.963383;0.975684;0.954595;0.950941 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845472;25693856;51380384;12845216;12845216 | 25821280;51401568;12767328;12869984;25812576;51403776;12778624;12833504;25811424;51374048;12832096;12825600;25808512;51412352;12777568;12836064;25821536;51383008;12807136;12876288 |
28 | InceptionV1/InceptionV1/Mixed_3b/concat | ConcatV2 | [[128 256 28 28]] | 339 | 122028032 | 122028032 | 251267584 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 61.45 | 0 | 21409333.33 | 21460242.67 | 95.90 | 0.00 | 0.00 | true | 0.968322;0.976026;0.949135;0.947274;0.965142;0.977053;0.947591;0.946868;0.961113;0.976458;0.956867;0.951522;0.966137;0.975854;0.948398;0.947394;0.963383;0.975684;0.954595;0.950941 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845472;25693856;51380384;12845216;12845216 | 25821280;51401568;12767328;12869984;25812576;51403776;12778624;12833504;25811424;51374048;12832096;12825600;25808512;51412352;12777568;12836064;25821536;51383008;12807136;12876288 |
28 | InceptionV1/InceptionV1/Mixed_3b/concat | ConcatV2 | [[128 256 28 28]] | 339 | 122028032 | 122028032 | 251267584 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 58.45 | 0 | 21409333.33 | 21460242.67 | 95.90 | 0.00 | 0.00 | true | 0.968322;0.976026;0.949135;0.947274;0.965142;0.977053;0.947591;0.946868;0.961113;0.976458;0.956867;0.951522;0.966137;0.975854;0.948398;0.947394;0.963383;0.975684;0.954595;0.950941 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845472;25693856;51380384;12845216;12845216 | 25821280;51401568;12767328;12869984;25812576;51403776;12778624;12833504;25811424;51374048;12832096;12825600;25808512;51412352;12777568;12836064;25821536;51383008;12807136;12876288 |
28 | InceptionV1/InceptionV1/Mixed_3b/concat | ConcatV2 | [[128 256 28 28]] | 339 | 122028032 | 122028032 | 251267584 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 58.36 | 0 | 21409333.33 | 21460242.67 | 95.90 | 0.00 | 0.00 | true | 0.968322;0.976026;0.949135;0.947274;0.965142;0.977053;0.947591;0.946868;0.961113;0.976458;0.956867;0.951522;0.966137;0.975854;0.948398;0.947394;0.963383;0.975684;0.954595;0.950941 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845216;25691552;51380384;12845216;12845472;25693856;51380384;12845216;12845216 | 25821280;51401568;12767328;12869984;25812576;51403776;12778624;12833504;25811424;51374048;12832096;12825600;25808512;51412352;12777568;12836064;25821536;51383008;12807136;12876288 |
29 | InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 256 28 28]] | 276.667 | 122028032 | 0 | 148507136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 255.67 | 0 | 102760608.00 | 102672288.00 | 97.20 | 0.00 | 0.00 | true | 0.971337;0.971187;0.973249;0.971440;0.971890 | 0;0;0;0;0 | 102655328;102685824;102675712;102697600;102628832 | 102760608;102760864;102760608;102760608;102760608 |
30 | InceptionV1/InceptionV1/Mixed_3c/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 256 28 28]] | 648.667 | 102760448 | 102760448 | 251267584 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 607.33 | 25690112 | 1070549.33 | 1322101.33 | 70.70 | 10.74 | 42.30 | true | 0.706779;0.706953;0.706951;0.706893;0.706888 | 25690112;25690112;25690112;25690112;25690112 | 3211392;128;128;3214464;128 | 3963616;1280;1280;3964096;1408 |
31 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 292.667 | 12845056 | 12882688 | 264112640 | GPU_0_bfc | 37632 | 0 | 0 | 0 | volta_scudnn_128x32_relu_interior_nn_v1 | 191.00 | 1650589696 | 102837866.67 | 13090912.00 | 19.20 | 14.24 | 8641.83 | true | 0.193598;0.200017;0.191685;0.191572;0.191353 | 1650589696;1650589696;1650589696;1650589696;1650589696 | 102837056;102837696;102837952;102838272;102837952 | 13104224;13074848;13100704;13085664;13086368 |
31 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 292.667 | 12845056 | 12882688 | 264112640 | GPU_0_bfc | 37632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 35008.00 | 30304.00 | 41.90 | 0.00 | 0.00 | true | 0.420518;0.418187;0.419273;0.418593;0.419948 | 0;0;0;0;0 | 35008;35008;35008;35008;35008 | 30656;29600;28608;30656;30752 |
31 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 28 28]] | 292.667 | 12845056 | 12882688 | 264112640 | GPU_0_bfc | 37632 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 1632.00 | 2880.00 | 5.90 | 0.00 | 0.00 | true | 0.059051;0.059075;0.059064;0.059064;0.059060 | 0;0;0;0;0 | 2656;3392;2592;2112;3904 | 1632;1632;1632;1632;1632 |
32 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 640 | 64225280 | 64361216 | 328337920 | GPU_0_bfc | 135936 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 554.00 | 6602358784 | 62191936.00 | 15412533.33 | 24.00 | 85.08 | 11917.62 | false | 0.240047;0.240023;0.239840;0.241880;0.239502 | 6602358784;6602358784;6602358784;6602358784;6602358784 | 54662400;45014976;67547072;67554944;64366336 | 13556736;11174912;16752608;16763872;15928256 |
32 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 640 | 64225280 | 64361216 | 328337920 | GPU_0_bfc | 135936 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 131264.00 | 1024.00 | 41.80 | 0.00 | 0.00 | true | 0.418079;0.419193;0.417886;0.417930;0.419882 | 0;0;0;0;0 | 131264;131264;131264;131264;131264 | 0;2176;768;1664;640 |
32 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 640 | 64225280 | 64361216 | 328337920 | GPU_0_bfc | 135936 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 1664.00 | 5.90 | 0.00 | 0.00 | true | 0.059287;0.059299;0.059295;0.059299;0.059289 | 0;0;0;0;0 | 96;96;96;96;96 | 1664;1664;1664;1664;1664 |
33 | InceptionV1/InceptionV1/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 640 | 51380224 | 51516160 | 379718144 | GPU_0_bfc | 135936 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 536.00 | 6602358784 | 68570080.00 | 17418389.33 | 24.00 | 76.78 | 12317.83 | false | 0.240124;0.240411;0.240399;0.239130;0.240868 | 6602358784;6602358784;6602358784;6602358784;6602358784 | 73920928;67524288;70700640;61059072;67485312 | 17695936;17668064;17675872;16075744;16911232 |
33 | InceptionV1/InceptionV1/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 640 | 51380224 | 51516160 | 379718144 | GPU_0_bfc | 135936 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 131264.00 | 67626.67 | 42.40 | 0.00 | 0.00 | true | 0.420127;0.422467;0.427300;0.426154;0.423422 | 0;0;0;0;0 | 136384;131264;131264;131264;131264 | 58240;63360;73088;78144;66432 |
33 | InceptionV1/InceptionV1/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 28 28]] | 640 | 51380224 | 51516160 | 379718144 | GPU_0_bfc | 135936 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 4373.33 | 5.90 | 0.00 | 0.00 | true | 0.059354;0.059353;0.059339;0.059341;0.059375 | 0;0;0;0;0 | 96;96;96;96;96 | 3712;3712;3392;6144;5696 |
34 | InceptionV1/InceptionV1/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 28 28]] | 394.667 | 25690112 | 25760512 | 283380224 | GPU_0_bfc | 70400 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 298.33 | 3301179392 | 102896960.00 | 25699242.67 | 22.90 | 25.67 | 11065.42 | false | 0.229656;0.229864;0.229007;0.228707;0.229675 | 3301179392;3301179392;3301179392;3301179392;3301179392 | 25703200;25758624;25675520;25602944;25719008 | 102880448;102920640;102896192;102900736;102893952 |
34 | InceptionV1/InceptionV1/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 28 28]] | 394.667 | 25690112 | 25760512 | 283380224 | GPU_0_bfc | 70400 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 65728.00 | 69312.00 | 42.30 | 0.00 | 0.00 | true | 0.418635;0.424666;0.419638;0.426721;0.425354 | 0;0;0;0;0 | 69952;69568;66752;73408;68416 | 65728;65728;65728;65728;65728 |
34 | InceptionV1/InceptionV1/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 28 28]] | 394.667 | 25690112 | 25760512 | 283380224 | GPU_0_bfc | 70400 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 4565.33 | 5.90 | 0.00 | 0.00 | true | 0.059283;0.059285;0.059294;0.059304;0.059288 | 0;0;0;0;0 | 4608;4288;5184;4032;4800 | 96;96;96;96;96 |
35 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 32 28 28]] | 61 | 12845056 | 0 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 35.33 | 3211264 | 12845781.33 | 12457440.00 | 46.80 | 0.13 | 90.89 | true | 0.469520;0.466342;0.468589;0.467646;0.468813 | 3211264;3211264;3211264;3211264;3211264 | 12845952;12845696;12845952;12845696;12845696 | 12436640;12434912;12500768;12513056;12432864 |
36 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 28 28]] | 149.667 | 64225280 | 0 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 131.33 | 12845056 | 51380992.00 | 51398304.00 | 49.10 | 0.12 | 97.81 | true | 0.491330;0.490427;0.490257;0.489950;0.490894 | 12845056;12845056;12845056;12845056;12845056 | 51380992;51380992;51380992;51380992;51380992 | 51416864;51402912;51387168;51381920;51404832 |
37 | InceptionV1/InceptionV1/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 28 28]] | 149.333 | 51380224 | 0 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 132.00 | 12845056 | 51380992.00 | 51393866.67 | 49.10 | 0.12 | 97.31 | true | 0.490538;0.490688;0.490585;0.490627;0.490894 | 12845056;12845056;12845056;12845056;12845056 | 51387552;51404448;51379584;51399968;51394080 | 51380992;51380992;51381504;51380992;51380992 |
38 | InceptionV1/InceptionV1/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 28 28]] | 85.667 | 25690112 | 0 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 67.67 | 6422528 | 25690624.00 | 25667264.00 | 48.20 | 0.13 | 94.91 | true | 0.482920;0.481741;0.482386;0.482553;0.481928 | 6422528;6422528;6422528;6422528;6422528 | 25690624;25690624;25690624;25690624;25690624 | 25664672;25662368;25674016;25667232;25669888 |
39 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 32 28 28]] | 52.667 | 12845056 | 0 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 35.00 | 0 | 12845472.00 | 12738880.00 | 90.10 | 0.00 | 0.00 | true | 0.902311;0.895232;0.908535;0.904283;0.889501 | 0;0;0;0;0 | 12845472;12848544;12845472;12845472;12845472 | 12759296;12736896;12730624;12740832;12738912 |
40 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 128 28 28]] | 146 | 64225280 | 0 | 180619776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 129.33 | 0 | 51380384.00 | 51457141.33 | 96.30 | 0.00 | 0.00 | true | 0.962750;0.963946;0.964707;0.962127;0.963642 | 0;0;0;0;0 | 51380384;51380384;51380384;51380384;51380384 | 51438048;51467008;51451136;51453280;51467904 |
41 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 509.667 | 38535168 | 199098368 | 219154944 | GPU_0_bfc | 160563200 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 159.33 | 1362001920 | 30847349.33 | 54005024.00 | 16.70 | 16.05 | 8548.15 | true | 0.165681;0.162168;0.167892;0.168754;0.166242 | 1362001920;1362001920;1362001920;1362001920;1362001920 | 30955808;30797472;30797088;30931616;30812960 | 54038880;53979264;53939552;54027264;54008544 |
41 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 509.667 | 38535168 | 199098368 | 219154944 | GPU_0_bfc | 160563200 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 142.67 | 264241152 | 53305301.33 | 41638720.00 | 46.50 | 2.78 | 1852.15 | true | 0.465620;0.469874;0.464643;0.460894;0.464602 | 264241152;264241152;264241152;264241152;264241152 | 53297280;53312128;53313088;53303552;53300224 | 41543584;41557088;41637120;41723392;41721952 |
41 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 509.667 | 38535168 | 199098368 | 219154944 | GPU_0_bfc | 160563200 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 56.00 | 92471296 | 13038218.67 | 17819285.33 | 43.00 | 3.00 | 1651.27 | true | 0.434524;0.433475;0.431139;0.426344;0.425891 | 92471296;92471296;92471296;92471296;92471296 | 17785248;17844160;17931168;17803648;17810048 | 13041440;13023136;13051680;13050080;13008928 |
41 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 509.667 | 38535168 | 199098368 | 219154944 | GPU_0_bfc | 160563200 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 31.33 | 69353472 | 21600.00 | 12844906.67 | 41.20 | 5.39 | 2213.43 | true | 0.410301;0.415833;0.413126;0.409228;0.413594 | 69353472;69353472;69353472;69353472;69353472 | 19424;22944;19104;22688;22688 | 12864224;12849760;12836448;12848512;12829152 |
41 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 96 28 28]] | 509.667 | 38535168 | 199098368 | 219154944 | GPU_0_bfc | 160563200 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 110784.00 | 148992.00 | 44.10 | 0.00 | 0.00 | true | 0.442983;0.440643;0.440779;0.442311;0.441071 | 0;0;0;0;0 | 142336;139520;150144;155520;154496 | 110784;110784;110784;112832;110784 |
42 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 28 28]] | 1634.333 | 77070336 | 363192320 | 283380224 | GPU_0_bfc | 286121984 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 884.00 | 10545070080 | 14325856.00 | 18111722.67 | 16.90 | 325.09 | 11928.81 | false | 0.168810;0.171423;0.165801;0.168396;0.169532 | 10545070080;10545070080;10545070080;10545070080;10545070080 | 19771456;13065984;15434400;19394464;19506304 | 15734848;10121696;12067104;15435104;15475360 |
42 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 28 28]] | 1634.333 | 77070336 | 363192320 | 283380224 | GPU_0_bfc | 286121984 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 266.67 | 528482304 | 106955114.67 | 82806218.67 | 47.40 | 2.78 | 1981.81 | true | 0.473481;0.474087;0.473175;0.474282;0.470954 | 528482304;528482304;528482304;528482304;528482304 | 106961696;106957216;106958944;106949184;106942240 | 82816064;82816384;82771584;82786208;82847840 |
42 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 28 28]] | 1634.333 | 77070336 | 363192320 | 283380224 | GPU_0_bfc | 286121984 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 184.33 | 369885184 | 51743221.33 | 71333429.33 | 47.30 | 3.01 | 2006.61 | true | 0.472276;0.472808;0.473199;0.473217;0.471829 | 369885184;369885184;369885184;369885184;369885184 | 71152512;71211488;71386880;71401920;71467264 | 51749856;51744992;51720032;51772128;51734816 |
42 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 28 28]] | 1634.333 | 77070336 | 363192320 | 283380224 | GPU_0_bfc | 286121984 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 168.00 | 554827776 | 792181.33 | 106276469.33 | 46.80 | 5.18 | 3302.55 | true | 0.467230;0.468205;0.467518;0.469193;0.467814 | 554827776;554827776;554827776;554827776;554827776 | 106306880;106370976;106292928;106200960;106229600 | 791904;795744;791712;792928;790112 |
42 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 192 28 28]] | 1634.333 | 77070336 | 363192320 | 283380224 | GPU_0_bfc | 286121984 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.33 | 0 | 885184.00 | 1197066.67 | 44.70 | 0.00 | 0.00 | true | 0.446722;0.449862;0.446711;0.445360;0.448747 | 0;0;0;0;0 | 1159040;1046048;1219552;1244800;1212608 | 885184;885184;885216;885184;885184 |
43 | InceptionV1/InceptionV1/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 96 28 28]] | 126.333 | 38535168 | 0 | 219154944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 100.67 | 9633792 | 38536874.67 | 38641013.33 | 48.80 | 0.12 | 95.70 | true | 0.487910;0.488540;0.487712;0.487871;0.488653 | 9633792;9633792;9633792;9633792;9633792 | 38536832;38537088;38536832;38536960;38536832 | 38647040;38643360;38629600;38643456;38636224 |
44 | InceptionV1/InceptionV1/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 192 28 28]] | 213.333 | 77070336 | 0 | 219154944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 194.33 | 19267584 | 77071360.00 | 77094165.33 | 49.60 | 0.12 | 99.15 | true | 0.495559;0.495699;0.496167;0.495951;0.495606 | 19267584;19267584;19267584;19267584;19267584 | 77071360;77071360;77071360;77071360;77071360 | 77084544;77100832;77100704;77087392;77094400 |
45 | InceptionV1/InceptionV1/Mixed_3c/concat | ConcatV2 | [[128 480 28 28]] | 568 | 192675840 | 192675840 | 411830784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 131.36 | 0 | 46029642.67 | 46060240.00 | 97.40 | 0.00 | 0.00 | true | 0.976360;0.978781;0.973333;0.963489;0.974706;0.978804;0.973729;0.962767;0.975414;0.979319;0.973621;0.964848;0.975692;0.979331;0.974612;0.962029;0.976434;0.978578;0.974280;0.965798 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 51381664;77070496;38545312;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272 | 51497984;77077088;38501088;25681248;51496416;77086816;38488416;25682272;51477344;77090144;38499168;25686240;51504608;77084640;38494592;25656544;51493632;77101568;38506304;25652832 |
45 | InceptionV1/InceptionV1/Mixed_3c/concat | ConcatV2 | [[128 480 28 28]] | 568 | 192675840 | 192675840 | 411830784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 125.45 | 0 | 46029642.67 | 46060240.00 | 97.40 | 0.00 | 0.00 | true | 0.976360;0.978781;0.973333;0.963489;0.974706;0.978804;0.973729;0.962767;0.975414;0.979319;0.973621;0.964848;0.975692;0.979331;0.974612;0.962029;0.976434;0.978578;0.974280;0.965798 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 51497984;77077088;38501088;25681248;51496416;77086816;38488416;25682272;51477344;77090144;38499168;25686240;51504608;77084640;38494592;25656544;51493632;77101568;38506304;25652832 | 51381664;77070496;38545312;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272 |
45 | InceptionV1/InceptionV1/Mixed_3c/concat | ConcatV2 | [[128 480 28 28]] | 568 | 192675840 | 192675840 | 411830784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 122.18 | 0 | 46029642.67 | 46060240.00 | 97.40 | 0.00 | 0.00 | true | 0.976360;0.978781;0.973333;0.963489;0.974706;0.978804;0.973729;0.962767;0.975414;0.979319;0.973621;0.964848;0.975692;0.979331;0.974612;0.962029;0.976434;0.978578;0.974280;0.965798 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 51381664;77070496;38545312;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272 | 51497984;77077088;38501088;25681248;51496416;77086816;38488416;25682272;51477344;77090144;38499168;25686240;51504608;77084640;38494592;25656544;51493632;77101568;38506304;25652832 |
45 | InceptionV1/InceptionV1/Mixed_3c/concat | ConcatV2 | [[128 480 28 28]] | 568 | 192675840 | 192675840 | 411830784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 119.27 | 0 | 46029642.67 | 46060240.00 | 97.40 | 0.00 | 0.00 | true | 0.976360;0.978781;0.973333;0.963489;0.974706;0.978804;0.973729;0.962767;0.975414;0.979319;0.973621;0.964848;0.975692;0.979331;0.974612;0.962029;0.976434;0.978578;0.974280;0.965798 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 51381664;77070496;38545312;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272;51381664;77070496;38535328;25690272 | 51497984;77077088;38501088;25681248;51496416;77086816;38488416;25682272;51477344;77090144;38499168;25686240;51504608;77084640;38494592;25656544;51493632;77101568;38506304;25652832 |
46 | InceptionV1/InceptionV1/Mixed_3c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 480 28 28]] | 500.333 | 192675840 | 0 | 219154944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 478.33 | 0 | 121426944.00 | 124380874.67 | 97.80 | 0.00 | 0.00 | true | 0.978165;0.979091;0.977620;0.977939;0.976432 | 0;0;0;0;0 | 123433024;141496384;117411968;123435840;117411904 | 126384480;141429824;120364064;126394080;114348896 |
47 | InceptionV1/InceptionV1/MaxPool_4a_3x3/MaxPool | MaxPool | [[128 480 14 14]] | 485.667 | 48168960 | 48168960 | 267323904 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 444.00 | 12042240 | 165585322.67 | 52078389.33 | 62.80 | 0.06 | 27.12 | true | 0.627071;0.628052;0.628132;0.627814;0.628037 | 12042240;12042240;12042240;12042240;12042240 | 159563072;165584960;159564160;192681152;171606848 | 50830848;51804672;48873408;60424736;53599648 |
48 | InceptionV1/InceptionV1/Mixed_4b/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 480 14 14]] | 326.667 | 48168960 | 48168960 | 122817024 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 296.33 | 12042240 | 48169152.00 | 56692405.33 | 61.50 | 0.11 | 40.64 | true | 0.614900;0.614782;0.614776;0.614720;0.614876 | 12042240;12042240;12042240;12042240;12042240 | 48169152;48169152;48169152;48169152;48169152 | 56639232;56653792;56784192;56544352;56795520 |
49 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 16 14 14]] | 256.667 | 1605632 | 1637632 | 124422656 | GPU_0_bfc | 32000 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 157.67 | 1544617984 | 48721941.33 | 3383200.00 | 15.40 | 29.64 | 9796.71 | false | 0.154168;0.154427;0.153839;0.153431;0.154706 | 1544617984;1544617984;1544617984;1544617984;1544617984 | 48884256;48882272;49054688;48399296;48356128 | 3381568;3412896;3382304;3355424;3385728 |
49 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 16 14 14]] | 256.667 | 1605632 | 1637632 | 124422656 | GPU_0_bfc | 32000 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 32960.00 | 17173.33 | 43.10 | 0.00 | 0.00 | true | 0.430565;0.429944;0.429990;0.433905;0.431168 | 0;0;0;0;0 | 32960;32960;33216;32960;32960 | 19424;17568;16928;14816;17024 |
49 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 16 14 14]] | 256.667 | 1605632 | 1637632 | 124422656 | GPU_0_bfc | 32000 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 608.00 | 981.33 | 5.60 | 0.00 | 0.00 | true | 0.056033;0.055926;0.056794;0.056109;0.055854 | 0;0;0;0;0 | 608;608;608;608;608 | 1152;704;896;1056;992 |
50 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 96 14 14]] | 327 | 9633792 | 9819392 | 134056448 | GPU_0_bfc | 185600 | 0 | 0 | 0 | volta_scudnn_128x32_relu_interior_nn_v1 | 224.33 | 2316926976 | 106510218.67 | 8780938.67 | 20.80 | 20.10 | 10328.07 | false | 0.207486;0.208627;0.208024;0.209150;0.207909 | 2316926976;2316926976;2316926976;2316926976;2316926976 | 8754528;8798624;8783296;8793856;8765664 | 106974240;106066560;107574400;106489856;104505024 |
50 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 96 14 14]] | 327 | 9633792 | 9819392 | 134056448 | GPU_0_bfc | 185600 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 184512.00 | 8576.00 | 44.20 | 0.00 | 0.00 | true | 0.442568;0.441960;0.442055;0.441747;0.444486 | 0;0;0;0;0 | 13376;7072;5280;3840;15968 | 184512;184512;189632;184512;184512 |
50 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 96 14 14]] | 327 | 9633792 | 9819392 | 134056448 | GPU_0_bfc | 185600 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 0.00 | 5.80 | 0.00 | 0.00 | true | 0.057907;0.057917;0.057947;0.057949;0.057910 | 0;0;0;0;0 | 96;96;96;96;96 | 128;0;0;0;0 |
51 | InceptionV1/InceptionV1/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 192 14 14]] | 498.667 | 19267584 | 19637504 | 153324032 | GPU_0_bfc | 369920 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 397.67 | 4633853952 | 145299285.33 | 20909258.67 | 23.00 | 27.88 | 11652.60 | false | 0.229003;0.230050;0.230759;0.230059;0.229241 | 4633853952;4633853952;4633853952;4633853952;4633853952 | 146577376;144906432;145328384;145663040;139540320 | 20821184;20805632;20940704;20979296;20965888 |
51 | InceptionV1/InceptionV1/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 192 14 14]] | 498.667 | 19267584 | 19637504 | 153324032 | GPU_0_bfc | 369920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 373354.67 | 350698.67 | 45.80 | 0.00 | 0.00 | true | 0.458360;0.457402;0.458348;0.456873;0.459938 | 0;0;0;0;0 | 373856;373504;373216;372544;373344 | 365472;352512;328000;344192;355392 |
51 | InceptionV1/InceptionV1/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 192 14 14]] | 498.667 | 19267584 | 19637504 | 153324032 | GPU_0_bfc | 369920 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 138.67 | 1344.00 | 5.80 | 0.00 | 0.00 | true | 0.057959;0.057994;0.057959;0.058015;0.057957 | 0;0;0;0;0 | 192;128;128;96;160 | 1504;1152;1216;5824;1312 |
52 | InceptionV1/InceptionV1/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 250.333 | 6422528 | 6546688 | 111577600 | GPU_0_bfc | 124160 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 156.00 | 1544617984 | 49258784.00 | 6440277.33 | 15.50 | 27.73 | 9901.40 | false | 0.154392;0.155239;0.154824;0.154013;0.154396 | 1544617984;1544617984;1544617984;1544617984;1544617984 | 50270944;49449792;49627360;48699200;48594528 | 6433696;6449440;6437696;6352224;6460928 |
52 | InceptionV1/InceptionV1/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 250.333 | 6422528 | 6546688 | 111577600 | GPU_0_bfc | 124160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 124000.00 | 73802.67 | 43.50 | 0.00 | 0.00 | true | 0.433568;0.434331;0.435136;0.434400;0.435775 | 0;0;0;0;0 | 75392;66656;79360;59008;90464 | 124512;123968;123904;123872;124128 |
52 | InceptionV1/InceptionV1/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 250.333 | 6422528 | 6546688 | 111577600 | GPU_0_bfc | 124160 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 810.67 | 1450.67 | 5.80 | 0.00 | 0.00 | true | 0.057924;0.058096;0.057917;0.057921;0.057949 | 0;0;0;0;0 | 128;2144;128;160;5472 | 1408;1728;704;1216;4576 |
53 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 16 14 14]] | 32.333 | 1605632 | 0 | 63408640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 9.00 | 401408 | 1606602.67 | 1326656.00 | 45.00 | 0.14 | 44.60 | true | 0.450422;0.449461;0.450287;0.447803;0.449869 | 401408;401408;401408;401408;401408 | 1607040;1606432;1606496;1606720;1606592 | 1318272;1314400;1308224;1355552;1347296 |
54 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 96 14 14]] | 47.333 | 9633792 | 0 | 63408640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 29.00 | 2408448 | 9637749.33 | 9919466.67 | 45.80 | 0.12 | 83.05 | true | 0.457711;0.458347;0.457122;0.456601;0.460770 | 2408448;2408448;2408448;2408448;2408448 | 9900928;9925280;9908096;9925024;9941472 | 9638592;9637568;9638880;9636544;9637088 |
55 | InceptionV1/InceptionV1/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 192 14 14]] | 71 | 19267584 | 0 | 63408640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 53.33 | 4816896 | 19268544.00 | 19311648.00 | 47.50 | 0.12 | 90.32 | true | 0.475488;0.475102;0.472282;0.475716;0.475458 | 4816896;4816896;4816896;4816896;4816896 | 19268608;19268544;19268544;19268544;19268544 | 19313696;19310368;19322912;19309056;19310880 |
56 | InceptionV1/InceptionV1/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 37.333 | 6422528 | 0 | 63408640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.33 | 1605632 | 6423040.00 | 6389237.33 | 45.30 | 0.13 | 78.97 | true | 0.453670;0.454227;0.451656;0.452313;0.454969 | 1605632;1605632;1605632;1605632;1605632 | 6423040;6423040;6423040;6423040;6423040 | 6402976;6386464;6387872;6393376;6377600 |
57 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 16 14 14]] | 24.333 | 1605632 | 0 | 63408640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 7.67 | 0 | 1605728.00 | 1640277.33 | 54.20 | 0.00 | 0.00 | true | 0.534908;0.550344;0.538406;0.537925;0.549337 | 0;0;0;0;0 | 1605728;1605728;1605728;1607520;1605728 | 1627776;1639296;1643776;1642368;1639168 |
58 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 96 14 14]] | 42.667 | 9633792 | 0 | 63408640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 26.67 | 0 | 9633888.00 | 9466549.33 | 88.80 | 0.00 | 0.00 | true | 0.896372;0.885539;0.883790;0.886905;0.890404 | 0;0;0;0;0 | 9633888;9633888;9633888;9633888;9633888 | 9464288;9458752;9461600;9473760;9512288 |
59 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 48 14 14]] | 154.333 | 4816896 | 4937984 | 68225536 | GPU_0_bfc | 121088 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 51.00 | 300941312 | 1724373.33 | 5673472.00 | 23.60 | 40.68 | 5900.81 | false | 0.235415;0.235491;0.237856;0.237036;0.235929 | 300941312;300941312;300941312;300941312;300941312 | 1722752;1724608;1724736;1723776;1728960 | 5697664;5681920;5686784;5651712;5646912 |
59 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 48 14 14]] | 154.333 | 4816896 | 4937984 | 68225536 | GPU_0_bfc | 121088 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.33 | 59392 | 6762.67 | 85664.00 | 6.20 | 0.64 | 9.38 | true | 0.062437;0.062440;0.062437;0.062436;0.062434 | 59392;59392;59392;59392;59392 | 6720;6720;6720;6848;6848 | 80032;87456;82976;96416;86560 |
59 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 48 14 14]] | 154.333 | 4816896 | 4937984 | 68225536 | GPU_0_bfc | 121088 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 27840.00 | 75904.00 | 42.70 | 0.00 | 0.00 | true | 0.430465;0.427406;0.427403;0.427125;0.427376 | 0;0;0;0;0 | 27840;27840;27840;32192;27840 | 71808;78208;80896;77696;69504 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | volta_cgemm_32x32_tn | 276.33 | 3195666432 | 36780021.33 | 31182304.00 | 23.40 | 47.02 | 11564.55 | false | 0.232723;0.234354;0.234102;0.234738;0.233250 | 3195666432;3195666432;3195666432;3195666432;3195666432 | 36929632;36712672;36830944;36719008;36790112 | 31184992;31187296;31174624;31263360;31171808 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 90.00 | 123002880 | 30512234.67 | 23028832.00 | 69.40 | 2.30 | 1366.70 | true | 0.694908;0.691636;0.693412;0.692907;0.695323 | 123002880;123002880;123002880;123002880;123002880 | 30517600;30481696;30536576;30519360;30499744 | 23050816;23095424;23017664;23008416;23018016 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 37.57 | 71092224 | 5043328.00 | 18228768.00 | 63.00 | 3.05 | 1892.21 | true | 0.637584;0.616009;0.639194;0.616935;0.639886;0.626639;0.638905;0.624276;0.635427;0.616267 | 88018944;54165504;88018944;54165504;88018944;54165504;88018944;54165504;88018944;54165504 | 202464;9989568;198112;9868928;198816;9911680;197536;10031040;198528;9879552 | 22271552;14161920;22292384;14109440;22257760;14202848;22230464;14186368;22265920;14229248 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 37.14 | 71092224 | 5043328.00 | 18228768.00 | 63.00 | 3.05 | 1914.01 | true | 0.637584;0.616009;0.639194;0.616935;0.639886;0.626639;0.638905;0.624276;0.635427;0.616267 | 88018944;54165504;88018944;54165504;88018944;54165504;88018944;54165504;88018944;54165504 | 22271552;14161920;22292384;14109440;22257760;14202848;22230464;14186368;22265920;14229248 | 202464;9989568;198112;9868928;198816;9911680;197536;10031040;198528;9879552 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | void flip_filter<float, float>(float*, float const*, int, int, int, int) | 36.00 | 0 | 1813.33 | 814954.67 | 4.40 | 0.00 | 0.00 | true | 0.043884;0.043913;0.054397;0.043977;0.043638 | 0;0;0;0;0 | 1856;1856;1760;1760;1824 | 807232;807936;817440;830944;819488 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 719040.00 | 882730.67 | 45.10 | 0.00 | 0.00 | true | 0.450147;0.455834;0.452512;0.448793;0.449880 | 0;0;0;0;0 | 721312;719040;719040;718912;719040 | 887072;894656;883936;872768;877184 |
60 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 208 14 14]] | 618.667 | 20873216 | 96628736 | 87493120 | GPU_0_bfc | 75755520 | 0 | 0 | 0 | compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 4.00 | 0 | 1216.00 | 12160.00 | 2.70 | 0.00 | 0.00 | true | 0.026621;0.026615;0.026582;0.026660;0.026574 | 0;0;0;0;0 | 1216;1216;1216;1216;1216 | 24192;12160;12416;11904;9472 |
61 | InceptionV1/InceptionV1/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 48 14 14]] | 40 | 4816896 | 0 | 77859328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 16.33 | 1204224 | 4818720.00 | 4805781.33 | 44.70 | 0.13 | 73.73 | true | 0.454951;0.450074;0.448713;0.442205;0.443503 | 1204224;1204224;1204224;1204224;1204224 | 4818592;4819200;4825536;4818368;4818368 | 4822240;4798976;4804544;4808032;4804768 |
62 | InceptionV1/InceptionV1/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 208 14 14]] | 74.333 | 20873216 | 0 | 77859328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 57.00 | 5218304 | 20874240.00 | 20892266.67 | 47.80 | 0.12 | 91.55 | true | 0.479090;0.477046;0.478634;0.478867;0.477994 | 5218304;5218304;5218304;5218304;5218304 | 20873632;20891424;20893344;20892032;20900288 | 20874240;20874240;20874240;20874240;20874304 |
63 | InceptionV1/InceptionV1/Mixed_4b/concat | ConcatV2 | [[128 512 14 14]] | 199.333 | 75464704 | 75464704 | 153324032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 38.64 | 0 | 12845200.00 | 12889400.00 | 94.50 | 0.00 | 0.00 | true | 0.959920;0.954844;0.934114;0.934933;0.957993;0.959135;0.927196;0.931489;0.955303;0.960860;0.930321;0.934702;0.953444;0.954611;0.932076;0.931398;0.957556;0.962073;0.926817;0.933170 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 19347808;20889600;4755904;6478560;19363040;20891616;4746048;6456064;19340384;20883072;4755680;6455776;19369216;20863584;4738368;6457184;19335008;20900224;4755648;6450272 | 19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688 |
63 | InceptionV1/InceptionV1/Mixed_4b/concat | ConcatV2 | [[128 512 14 14]] | 199.333 | 75464704 | 75464704 | 153324032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 38.18 | 0 | 12845200.00 | 12889400.00 | 94.50 | 0.00 | 0.00 | true | 0.959920;0.954844;0.934114;0.934933;0.957993;0.959135;0.927196;0.931489;0.955303;0.960860;0.930321;0.934702;0.953444;0.954611;0.932076;0.931398;0.957556;0.962073;0.926817;0.933170 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688 | 19347808;20889600;4755904;6478560;19363040;20891616;4746048;6456064;19340384;20883072;4755680;6455776;19369216;20863584;4738368;6457184;19335008;20900224;4755648;6450272 |
63 | InceptionV1/InceptionV1/Mixed_4b/concat | ConcatV2 | [[128 512 14 14]] | 199.333 | 75464704 | 75464704 | 153324032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 35.18 | 0 | 12845200.00 | 12889400.00 | 94.50 | 0.00 | 0.00 | true | 0.959920;0.954844;0.934114;0.934933;0.957993;0.959135;0.927196;0.931489;0.955303;0.960860;0.930321;0.934702;0.953444;0.954611;0.932076;0.931398;0.957556;0.962073;0.926817;0.933170 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 19347808;20889600;4755904;6478560;19363040;20891616;4746048;6456064;19340384;20883072;4755680;6455776;19369216;20863584;4738368;6457184;19335008;20900224;4755648;6450272 | 19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688 |
63 | InceptionV1/InceptionV1/Mixed_4b/concat | ConcatV2 | [[128 512 14 14]] | 199.333 | 75464704 | 75464704 | 153324032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 34.82 | 0 | 12845200.00 | 12889400.00 | 94.50 | 0.00 | 0.00 | true | 0.959920;0.954844;0.934114;0.934933;0.957993;0.959135;0.927196;0.931489;0.955303;0.960860;0.930321;0.934702;0.953444;0.954611;0.932076;0.931398;0.957556;0.962073;0.926817;0.933170 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 19347808;20889600;4755904;6478560;19363040;20891616;4746048;6456064;19340384;20883072;4755680;6455776;19369216;20863584;4738368;6457184;19335008;20900224;4755648;6450272 | 19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422624;19267744;20873376;4817056;6422688 |
64 | InceptionV1/InceptionV1/Mixed_4b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 512 14 14]] | 147.333 | 75464704 | 0 | 101943808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 129.00 | 0 | 51380640.00 | 51273248.00 | 96.30 | 0.00 | 0.00 | true | 0.960733;0.963574;0.964260;0.963311;0.963012 | 0;0;0;0;0 | 51380640;51380640;51380384;51380640;51380640 | 51247840;51259136;51285600;51295584;51275008 |
65 | InceptionV1/InceptionV1/Mixed_4c/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 512 14 14]] | 353 | 51380224 | 51380224 | 153324032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 318.00 | 12845056 | 51385792.00 | 61966261.33 | 61.40 | 0.11 | 40.39 | true | 0.613707;0.613793;0.613854;0.614012;0.613844 | 12845056;12845056;12845056;12845056;12845056 | 62047904;61954240;61975296;61969248;61893088 | 51385792;51386048;51385792;51385792;51385792 |
66 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 24 14 14]] | 262.333 | 2408448 | 2458880 | 155732480 | GPU_0_bfc | 50432 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 164.00 | 1647378432 | 52488437.33 | 3748629.33 | 15.80 | 29.29 | 10044.99 | false | 0.157752;0.157765;0.157956;0.158141;0.157800 | 1647378432;1647378432;1647378432;1647378432;1647378432 | 52544288;52504832;52582624;51675968;52416192 | 3741920;3760992;3764608;3742976;3714112 |
66 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 24 14 14]] | 262.333 | 2408448 | 2458880 | 155732480 | GPU_0_bfc | 50432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 49344.00 | 22901.33 | 44.90 | 0.00 | 0.00 | true | 0.447249;0.448678;0.443958;0.454250;0.450062 | 0;0;0;0;0 | 49344;49344;49344;49344;54976 | 21664;23520;24000;21728;23456 |
66 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 24 14 14]] | 262.333 | 2408448 | 2458880 | 155732480 | GPU_0_bfc | 50432 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 352.00 | 842.67 | 5.90 | 0.00 | 0.00 | true | 0.059477;0.059460;0.059452;0.059448;0.059462 | 0;0;0;0;0 | 352;352;352;352;352 | 992;416;864;896;768 |
67 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 112 14 14]] | 520.667 | 11239424 | 11470080 | 166971904 | GPU_0_bfc | 230656 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 428.00 | 3294756864 | 100284981.33 | 12307114.67 | 22.40 | 29.26 | 7698.03 | false | 0.224247;0.221386;0.224521;0.224225;0.225846 | 3294756864;3294756864;3294756864;3294756864;3294756864 | 103955552;93992352;104397952;86018528;102907040 | 12546176;11764736;12610432;10683936;12695936 |
67 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 112 14 14]] | 520.667 | 11239424 | 11470080 | 166971904 | GPU_0_bfc | 230656 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 229568.00 | 4394.67 | 45.60 | 0.00 | 0.00 | true | 0.456147;0.457196;0.452688;0.455160;0.459303 | 0;0;0;0;0 | 5664;3584;448;3936;5984 | 229568;229568;229568;229568;229568 |
67 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 112 14 14]] | 520.667 | 11239424 | 11470080 | 166971904 | GPU_0_bfc | 230656 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 0.00 | 5.80 | 0.00 | 0.00 | true | 0.057966;0.058001;0.057973;0.057996;0.057971 | 0;0;0;0;0 | 96;96;96;96;96 | 128;0;0;0;0 |
68 | InceptionV1/InceptionV1/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 14 14]] | 533.667 | 18464768 | 18793728 | 185436672 | GPU_0_bfc | 328960 | 0 | 0 | 0 | volta_scudnn_128x32_relu_interior_nn_v1 | 440.00 | 4118446080 | 140187573.33 | 13641248.00 | 20.50 | 26.77 | 9360.10 | false | 0.204482;0.203974;0.205382;0.204806;0.205266 | 4118446080;4118446080;4118446080;4118446080;4118446080 | 137284416;138809792;144468512;133231840;168250816 | 13752704;13128512;13599264;13571776;15625088 |
68 | InceptionV1/InceptionV1/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 14 14]] | 533.667 | 18464768 | 18793728 | 185436672 | GPU_0_bfc | 328960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 327893.33 | 36021.33 | 45.70 | 0.00 | 0.00 | true | 0.458955;0.456663;0.442359;0.455018;0.457976 | 0;0;0;0;0 | 327872;327936;327872;327968;327872 | 26432;11200;62304;44800;36832 |
68 | InceptionV1/InceptionV1/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 14 14]] | 533.667 | 18464768 | 18793728 | 185436672 | GPU_0_bfc | 328960 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 298.67 | 5.80 | 0.00 | 0.00 | true | 0.057996;0.057994;0.058020;0.057971;0.058003 | 0;0;0;0;0 | 96;96;96;96;96 | 640;992;0;128;128 |
69 | InceptionV1/InceptionV1/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 259.333 | 6422528 | 6554880 | 116394496 | GPU_0_bfc | 132352 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 163.33 | 1647378432 | 52872672.00 | 6449866.67 | 15.80 | 27.77 | 10086.01 | false | 0.158068;0.158306;0.158409;0.158298;0.158240 | 1647378432;1647378432;1647378432;1647378432;1647378432 | 6351456;6504992;6439808;6429504;6480288 | 53303904;52488768;52917440;51980384;53211808 |
69 | InceptionV1/InceptionV1/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 259.333 | 6422528 | 6554880 | 116394496 | GPU_0_bfc | 132352 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 131968.00 | 41226.67 | 43.60 | 0.00 | 0.00 | true | 0.434635;0.439815;0.434667;0.448055;0.430404 | 0;0;0;0;0 | 27968;71072;45024;42592;36064 | 131552;132320;133664;132000;131584 |
69 | InceptionV1/InceptionV1/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 259.333 | 6422528 | 6554880 | 116394496 | GPU_0_bfc | 132352 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 117.33 | 992.00 | 5.80 | 0.00 | 0.00 | true | 0.057985;0.057996;0.058017;0.057969;0.058236 | 0;0;0;0;0 | 128;128;96;96;5280 | 480;1024;576;1376;2240 |
70 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 24 14 14]] | 34 | 2408448 | 0 | 65014272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 10.00 | 602112 | 2410293.33 | 2217472.00 | 45.00 | 0.13 | 60.21 | true | 0.450935;0.449420;0.448429;0.449462;0.450372 | 602112;602112;602112;602112;602112 | 2410560;2410208;2410624;2410112;2409696 | 2219488;2205696;2217952;2230912;2214976 |
71 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 112 14 14]] | 51.333 | 11239424 | 0 | 65014272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 32.67 | 2809856 | 11242378.67 | 11510336.00 | 45.90 | 0.12 | 86.02 | true | 0.454291;0.457412;0.460553;0.461816;0.459345 | 2809856;2809856;2809856;2809856;2809856 | 11242208;11242496;11242912;11241568;11242432 | 11509344;11517088;11519072;11504576;11502784 |
72 | InceptionV1/InceptionV1/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 160 14 14]] | 62.667 | 18464768 | 0 | 65014272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 45.00 | 4014080 | 16057216.00 | 16051392.00 | 47.20 | 0.13 | 89.20 | true | 0.471722;0.472437;0.473778;0.471961;0.471245 | 4014080;4014080;4014080;4014080;4014080 | 16057216;16057216;16057216;16059264;16057216 | 16045568;16062240;16051744;16034592;16056864 |
73 | InceptionV1/InceptionV1/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 38.667 | 6422528 | 0 | 65014272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.33 | 1605632 | 6423040.00 | 6384064.00 | 45.50 | 0.13 | 78.97 | true | 0.450194;0.453827;0.455722;0.456324;0.456229 | 1605632;1605632;1605632;1605632;1605632 | 6423040;6423040;6423040;6423040;6423040 | 6362144;6380416;6396416;6388640;6383136 |
74 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 24 14 14]] | 25.333 | 2408448 | 0 | 65014272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 9.33 | 0 | 2409226.67 | 2392021.33 | 65.90 | 0.00 | 0.00 | true | 0.659724;0.650583;0.657053;0.659761;0.660084 | 0;0;0;0;0 | 2414432;2408544;2408544;2408544;2410592 | 2457312;2386048;2393344;2386048;2396672 |
75 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 112 14 14]] | 45.667 | 11239424 | 0 | 65014272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 30.33 | 0 | 11239541.33 | 11146250.67 | 89.30 | 0.00 | 0.00 | true | 0.895802;0.892725;0.890478;0.890751;0.895367 | 0;0;0;0;0 | 11239584;11239520;11239520;11239520;11239584 | 11120224;11141088;11124064;11208640;11173600 |
76 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 161.333 | 6422528 | 6631680 | 71436800 | GPU_0_bfc | 209152 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 59.67 | 440401920 | 2882154.67 | 7695658.67 | 23.50 | 41.63 | 7381.00 | false | 0.235843;0.236898;0.235056;0.235181;0.234417 | 440401920;440401920;440401920;440401920;440401920 | 7706656;7727712;7691264;7679424;7689056 | 2880736;2897184;2848928;2890272;2875456 |
76 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 161.333 | 6422528 | 6631680 | 71436800 | GPU_0_bfc | 209152 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.33 | 89088 | 2752.00 | 158592.00 | 6.20 | 0.55 | 14.07 | true | 0.062417;0.062419;0.062413;0.062409;0.062422 | 89088;89088;89088;89088;89088 | 3008;2752;2752;2752;2752 | 161664;162720;155776;158336;154880 |
76 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 161.333 | 6422528 | 6631680 | 71436800 | GPU_0_bfc | 209152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 55488.00 | 128725.33 | 42.90 | 0.00 | 0.00 | true | 0.431418;0.428447;0.432692;0.427154;0.427492 | 0;0;0;0;0 | 55488;55488;55488;55488;55488 | 128256;124032;134272;125056;132864 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | volta_cgemm_32x32_tn | 318.00 | 3724148736 | 46298826.67 | 33547392.00 | 23.50 | 46.64 | 11711.16 | false | 0.234263;0.234216;0.234609;0.234989;0.235319 | 3724148736;3724148736;3724148736;3724148736;3724148736 | 45965472;45440992;46442144;46488864;46649440 | 33523840;33499776;33576160;33542176;33600416 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 89.67 | 132464640 | 32997120.00 | 24797301.33 | 69.20 | 2.29 | 1477.30 | true | 0.692581;0.689676;0.691338;0.692493;0.692096 | 132464640;132464640;132464640;132464640;132464640 | 33000736;33004960;32985664;32972992;33014592 | 24802208;24827168;24801248;24788448;24786080 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | void flip_filter<float, float>(float*, float const*, int, int, int, int) | 45.00 | 0 | 11946.67 | 940309.33 | 4.30 | 0.00 | 0.00 | true | 0.043444;0.043492;0.043120;0.043394;0.043138 | 0;0;0;0;0 | 12736;11680;11424;9312;13088 | 931776;936128;942464;942336;947552 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 44.14 | 86890496 | 6011802.67 | 22325274.67 | 64.80 | 3.07 | 1968.39 | true | 0.652634;0.645098;0.652507;0.644579;0.650894;0.643100;0.651200;0.640391;0.650169;0.635164 | 110587904;63193088;110587904;63193088;110587904;63193088;110587904;63193088;110587904;63193088 | 382784;11599488;396960;11702272;386656;11782464;383488;11757312;385568;11599872 | 28088448;16581408;28159968;16518368;28154880;16476160;28134432;16534336;28077536;16535488 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 43.86 | 86890496 | 6011802.67 | 22325274.67 | 64.80 | 3.07 | 1981.22 | true | 0.652634;0.645098;0.652507;0.644579;0.650894;0.643100;0.651200;0.640391;0.650169;0.635164 | 110587904;63193088;110587904;63193088;110587904;63193088;110587904;63193088;110587904;63193088 | 382784;11599488;396960;11702272;386656;11782464;383488;11757312;385568;11599872 | 28088448;16581408;28159968;16518368;28154880;16476160;28134432;16534336;28077536;16535488 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.67 | 0 | 906634.67 | 927338.67 | 44.30 | 0.00 | 0.00 | true | 0.442351;0.439472;0.442920;0.442701;0.443411 | 0;0;0;0;0 | 906368;906688;906528;906720;906688 | 925376;931424;928128;923872;928512 |
77 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 224 14 14]] | 673.667 | 22478848 | 115118080 | 91507200 | GPU_0_bfc | 92639232 | 0 | 0 | 0 | compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 3.67 | 0 | 192.00 | 9856.00 | 2.70 | 0.00 | 0.00 | true | 0.026510;0.026486;0.026609;0.026748;0.026484 | 0;0;0;0;0 | 192;192;192;192;192 | 8448;11008;10368;8192;10752 |
78 | InceptionV1/InceptionV1/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 45 | 6422528 | 0 | 80267776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.67 | 1605632 | 6424245.33 | 6404512.00 | 45.70 | 0.13 | 77.69 | true | 0.457631;0.458268;0.458485;0.455910;0.453906 | 1605632;1605632;1605632;1605632;1605632 | 6414624;6397472;6411424;6404640;6396192 | 6423648;6424448;6423712;6424576;6425088 |
79 | InceptionV1/InceptionV1/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 224 14 14]] | 78 | 22478848 | 0 | 80267776 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 60.33 | 5619712 | 22479936.00 | 22480906.67 | 47.90 | 0.12 | 93.14 | true | 0.478594;0.479670;0.479604;0.479182;0.478987 | 5619712;5619712;5619712;5619712;5619712 | 22479936;22479936;22479936;22479936;22479936 | 22476576;22476672;22477856;22488192;22488224 |
80 | InceptionV1/InceptionV1/Mixed_4c/concat | ConcatV2 | [[128 512 14 14]] | 200.333 | 51380224 | 51380224 | 131648000 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 37.36 | 0 | 11775344.00 | 11814522.67 | 94.70 | 0.00 | 0.00 | true | 0.952465;0.960803;0.938202;0.936663;0.954315;0.957969;0.939336;0.931718;0.952016;0.960793;0.941024;0.934877;0.959917;0.960240;0.936765;0.936833;0.955015;0.960514;0.928675;0.933500 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 16056480;22479008;6422688;6422624;16056480;22479008;6422688;6422688;16056480;22479008;6422688;6422624;16056480;22479008;6422688;6429280;16056480;22479008;6422688;6422624 | 16152032;22489952;6387808;6411360;16141024;22507904;6378592;6446944;16140032;22482272;6376672;6452192;16137216;22510816;6357472;6441568;16141920;22473824;6373760;6448352 |
80 | InceptionV1/InceptionV1/Mixed_4c/concat | ConcatV2 | [[128 512 14 14]] | 200.333 | 51380224 | 51380224 | 131648000 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 35.91 | 0 | 11775344.00 | 11814522.67 | 94.70 | 0.00 | 0.00 | true | 0.952465;0.960803;0.938202;0.936663;0.954315;0.957969;0.939336;0.931718;0.952016;0.960793;0.941024;0.934877;0.959917;0.960240;0.936765;0.936833;0.955015;0.960514;0.928675;0.933500 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 16056480;22479008;6422688;6422624;16056480;22479008;6422688;6422688;16056480;22479008;6422688;6422624;16056480;22479008;6422688;6429280;16056480;22479008;6422688;6422624 | 16152032;22489952;6387808;6411360;16141024;22507904;6378592;6446944;16140032;22482272;6376672;6452192;16137216;22510816;6357472;6441568;16141920;22473824;6373760;6448352 |
80 | InceptionV1/InceptionV1/Mixed_4c/concat | ConcatV2 | [[128 512 14 14]] | 200.333 | 51380224 | 51380224 | 131648000 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 33.55 | 0 | 11775344.00 | 11814522.67 | 94.70 | 0.00 | 0.00 | true | 0.952465;0.960803;0.938202;0.936663;0.954315;0.957969;0.939336;0.931718;0.952016;0.960793;0.941024;0.934877;0.959917;0.960240;0.936765;0.936833;0.955015;0.960514;0.928675;0.933500 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 16152032;22489952;6387808;6411360;16141024;22507904;6378592;6446944;16140032;22482272;6376672;6452192;16137216;22510816;6357472;6441568;16141920;22473824;6373760;6448352 | 16056480;22479008;6422688;6422624;16056480;22479008;6422688;6422688;16056480;22479008;6422688;6422624;16056480;22479008;6422688;6429280;16056480;22479008;6422688;6422624 |
80 | InceptionV1/InceptionV1/Mixed_4c/concat | ConcatV2 | [[128 512 14 14]] | 200.333 | 51380224 | 51380224 | 131648000 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 33.55 | 0 | 11775344.00 | 11814522.67 | 94.70 | 0.00 | 0.00 | true | 0.952465;0.960803;0.938202;0.936663;0.954315;0.957969;0.939336;0.931718;0.952016;0.960793;0.941024;0.934877;0.959917;0.960240;0.936765;0.936833;0.955015;0.960514;0.928675;0.933500 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 16152032;22489952;6387808;6411360;16141024;22507904;6378592;6446944;16140032;22482272;6376672;6452192;16137216;22510816;6357472;6441568;16141920;22473824;6373760;6448352 | 16056480;22479008;6422688;6422624;16056480;22479008;6422688;6422688;16056480;22479008;6422688;6422624;16056480;22479008;6422688;6429280;16056480;22479008;6422688;6422624 |
81 | InceptionV1/InceptionV1/Mixed_4c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 512 14 14]] | 149.333 | 51380224 | 0 | 77859328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 129.33 | 0 | 51364682.67 | 51260565.33 | 96.30 | 0.00 | 0.00 | true | 0.963110;0.962369;0.963229;0.961953;0.963038 | 0;0;0;0;0 | 51365152;51363744;51365408;51365152;51362464 | 51289344;51248896;51259520;51265504;51256672 |
82 | InceptionV1/InceptionV1/Mixed_4d/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 512 14 14]] | 350.333 | 67436544 | 67436544 | 145295872 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 317.67 | 12845056 | 51383744.00 | 62230325.33 | 61.40 | 0.11 | 40.44 | true | 0.614427;0.614321;0.614385;0.614002;0.613978 | 12845056;12845056;12845056;12845056;12845056 | 62222592;62248864;62219520;62122944;62299552 | 51383744;51383744;51383744;51384000;51383744 |
83 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 24 14 14]] | 262.333 | 2408448 | 2458880 | 147704320 | GPU_0_bfc | 50432 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 164.00 | 1647378432 | 52658378.67 | 3744714.67 | 15.80 | 29.21 | 10044.99 | false | 0.158293;0.158070;0.158229;0.158316;0.158178 | 1647378432;1647378432;1647378432;1647378432;1647378432 | 53200224;52251136;52453024;52383008;53139104 | 3730880;3755616;3760064;3743456;3735072 |
83 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 24 14 14]] | 262.333 | 2408448 | 2458880 | 147704320 | GPU_0_bfc | 50432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 49344.00 | 30922.67 | 45.30 | 0.00 | 0.00 | true | 0.455191;0.454180;0.447460;0.453229;0.450965 | 0;0;0;0;0 | 49344;49344;49344;49344;49344 | 28800;31616;32352;33696;25984 |
83 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 24 14 14]] | 262.333 | 2408448 | 2458880 | 147704320 | GPU_0_bfc | 50432 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 266.67 | 981.33 | 5.90 | 0.00 | 0.00 | true | 0.059456;0.059285;0.059502;0.059765;0.059469 | 0;0;0;0;0 | 608;96;96;96;2144 | 992;1344;576;608;1664 |
84 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 474.667 | 12845056 | 13108480 | 160549376 | GPU_0_bfc | 263424 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 397.33 | 3294756864 | 102809877.33 | 13898698.67 | 22.10 | 28.23 | 8292.18 | false | 0.222492;0.219663;0.222272;0.221596;0.216928 | 3294756864;3294756864;3294756864;3294756864;3294756864 | 101264576;103175104;103197184;102057344;103773504 | 13636704;13922272;14071584;13753600;14020224 |
84 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 474.667 | 12845056 | 13108480 | 160549376 | GPU_0_bfc | 263424 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 262336.00 | 10.67 | 45.00 | 0.00 | 0.00 | true | 0.451213;0.450167;0.447601;0.451360;0.447182 | 0;0;0;0;0 | 262336;262336;262336;262336;262336 | 32;32;0;0;0 |
84 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 474.667 | 12845056 | 13108480 | 160549376 | GPU_0_bfc | 263424 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 0.00 | 5.80 | 0.00 | 0.00 | true | 0.057989;0.057976;0.057980;0.057978;0.057976 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;0;0;0 |
85 | InceptionV1/InceptionV1/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 475.667 | 12845056 | 13108480 | 173394432 | GPU_0_bfc | 263424 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 397.33 | 3294756864 | 96821984.00 | 13674336.00 | 22.40 | 29.82 | 8292.18 | false | 0.224589;0.223072;0.220155;0.224958;0.223097 | 3294756864;3294756864;3294756864;3294756864;3294756864 | 100075680;87379808;101300480;101023648;89366624 | 14284928;12358048;14362368;14397280;12375712 |
85 | InceptionV1/InceptionV1/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 475.667 | 12845056 | 13108480 | 173394432 | GPU_0_bfc | 263424 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 262464.00 | 84256.00 | 44.60 | 0.00 | 0.00 | true | 0.446405;0.439420;0.451927;0.444222;0.446507 | 0;0;0;0;0 | 41408;142752;20768;151232;68608 | 262368;262336;262336;262688;264960 |
85 | InceptionV1/InceptionV1/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 475.667 | 12845056 | 13108480 | 173394432 | GPU_0_bfc | 263424 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 277.33 | 5.80 | 0.00 | 0.00 | true | 0.057969;0.058027;0.057954;0.058010;0.057973 | 0;0;0;0;0 | 352;96;96;96;96 | 640;128;160;544;128 |
86 | InceptionV1/InceptionV1/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 254.667 | 6422528 | 6554880 | 128436736 | GPU_0_bfc | 132352 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 163.33 | 1647378432 | 52674453.33 | 5567114.67 | 15.80 | 28.29 | 10086.01 | false | 0.158297;0.158148;0.158178;0.158063;0.158244 | 1647378432;1647378432;1647378432;1647378432;1647378432 | 51782784;52608320;52918176;52613632;52801408 | 5613248;5544480;5543616;5710624;5513472 |
86 | InceptionV1/InceptionV1/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 254.667 | 6422528 | 6554880 | 128436736 | GPU_0_bfc | 132352 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 131264.00 | 12181.33 | 44.40 | 0.00 | 0.00 | true | 0.435234;0.446981;0.444004;0.441721;0.450356 | 0;0;0;0;0 | 131264;131264;131296;131264;131264 | 7008;20768;14048;6496;15488 |
86 | InceptionV1/InceptionV1/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 254.667 | 6422528 | 6554880 | 128436736 | GPU_0_bfc | 132352 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 53.33 | 5.80 | 0.00 | 0.00 | true | 0.057964;0.057968;0.058020;0.057987;0.057975 | 0;0;0;0;0 | 96;96;608;96;96 | 0;160;768;0;0 |
87 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 24 14 14]] | 33 | 2408448 | 0 | 61000192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 10.00 | 602112 | 2410442.67 | 2217834.67 | 44.70 | 0.13 | 60.21 | true | 0.446494;0.446366;0.449203;0.446085;0.447856 | 602112;602112;602112;602112;602112 | 2412384;2410464;2410528;2410336;2410272 | 2219520;2223840;2209344;2213376;2220608 |
88 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 14 14]] | 54 | 12845056 | 0 | 61000192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 36.67 | 3211264 | 12847520.00 | 13106933.33 | 47.00 | 0.12 | 87.58 | true | 0.471280;0.469126;0.470112;0.468662;0.470251 | 3211264;3211264;3211264;3211264;3211264 | 12847584;12847424;12847552;12848192;12847168 | 13109952;13111040;13115264;13099808;13087552 |
89 | InceptionV1/InceptionV1/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 14 14]] | 53.667 | 12845056 | 0 | 61000192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 35.67 | 3211264 | 12845824.00 | 12812874.67 | 46.90 | 0.13 | 90.03 | true | 0.469749;0.468445;0.470250;0.468559;0.470010 | 3211264;3211264;3211264;3211264;3211264 | 12845824;12845824;12845824;12845824;12845824 | 12808992;12820128;12809504;12802976;12843680 |
90 | InceptionV1/InceptionV1/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 38 | 6422528 | 0 | 61000192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.00 | 1605632 | 6423040.00 | 6437066.67 | 45.10 | 0.12 | 80.28 | true | 0.450375;0.448486;0.452525;0.452182;0.450492 | 1605632;1605632;1605632;1605632;1605632 | 6424480;6438048;6430240;6462240;6442912 | 6423040;6423040;6423040;6423040;6423040 |
91 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 24 14 14]] | 25.667 | 2408448 | 0 | 61000192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 9.00 | 0 | 2408544.00 | 2362016.00 | 67.40 | 0.00 | 0.00 | true | 0.669680;0.684736;0.675003;0.676105;0.669426 | 0;0;0;0;0 | 2385408;2359296;2391936;2341344;2329216 | 2408544;2408544;2408544;2408544;2408544 |
92 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 128 14 14]] | 52.333 | 12845056 | 0 | 61000192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 34.33 | 0 | 12845152.00 | 12792416.00 | 89.30 | 0.00 | 0.00 | true | 0.898140;0.881184;0.892160;0.892172;0.894476 | 0;0;0;0;0 | 12845152;12847968;12845152;12845152;12845152 | 12781408;12778656;12796384;12799456;12817120 |
93 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 158.333 | 6422528 | 6631680 | 67422720 | GPU_0_bfc | 209152 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 56.67 | 440401920 | 2881333.33 | 7663690.67 | 23.40 | 41.76 | 7771.75 | false | 0.234288;0.232750;0.236430;0.234703;0.233915 | 440401920;440401920;440401920;440401920;440401920 | 2877984;2874016;2874144;2891904;2891872 | 7681792;7646912;7645120;7698016;7662368 |
93 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 158.333 | 6422528 | 6631680 | 67422720 | GPU_0_bfc | 209152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 55488.00 | 137856.00 | 42.70 | 0.00 | 0.00 | true | 0.429951;0.426588;0.427292;0.423201;0.426545 | 0;0;0;0;0 | 55488;55488;55488;55488;55488 | 134656;148864;126976;142848;136064 |
93 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 158.333 | 6422528 | 6631680 | 67422720 | GPU_0_bfc | 209152 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 89088 | 1728.00 | 170272.00 | 6.20 | 0.52 | 17.82 | true | 0.062355;0.062367;0.062365;0.062345;0.062346 | 89088;89088;89088;89088;89088 | 1728;1728;1728;1728;1728 | 172576;170656;172832;166304;167584 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | volta_cgemm_32x32_tn | 411.67 | 4860149760 | 57081333.33 | 37303285.33 | 23.60 | 51.49 | 11806.02 | false | 0.234491;0.235416;0.236197;0.236136;0.236501 | 4860149760;4860149760;4860149760;4860149760;4860149760 | 55206944;58285664;56203744;56754592;59434208 | 36445472;38380448;35971040;37097280;38367104 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 101.33 | 151388160 | 37782581.33 | 28215274.67 | 69.20 | 2.29 | 1493.97 | true | 0.692401;0.687691;0.694538;0.692270;0.691861 | 151388160;151388160;151388160;151388160;151388160 | 28219712;28218016;28208096;28203168;28249824 | 37802752;37736256;37745344;37799648;37811008 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | void flip_filter<float, float>(float*, float const*, int, int, int, int) | 56.67 | 0 | 11594.67 | 1060224.00 | 3.90 | 0.00 | 0.00 | true | 0.039487;0.039286;0.039334;0.039563;0.039277 | 0;0;0;0;0 | 11968;11744;11520;10176;11520 | 1055648;1079040;1064768;1045920;1060256 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 52.86 | 108331008 | 7025680.00 | 27991472.00 | 66.30 | 3.09 | 2049.51 | true | 0.671408;0.654777;0.671672;0.651026;0.672381;0.651296;0.669571;0.656996;0.670623;0.655945 | 144441344;72220672;144441344;72220672;144441344;72220672;144441344;72220672;144441344;72220672 | 685632;13308672;687584;13375168;686944;13467200;688992;13458496;689760;13403904 | 37054336;19009152;37029664;18862784;37014144;18922784;37012352;18990688;36999712;18886784 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 51.29 | 108331008 | 7025680.00 | 27991472.00 | 66.30 | 3.09 | 2112.29 | true | 0.671408;0.654777;0.671672;0.651026;0.672381;0.651296;0.669571;0.656996;0.670623;0.655945 | 144441344;72220672;144441344;72220672;144441344;72220672;144441344;72220672;144441344;72220672 | 685632;13308672;687584;13375168;686944;13467200;688992;13458496;689760;13403904 | 37054336;19009152;37029664;18862784;37014144;18922784;37012352;18990688;36999712;18886784 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1186197.33 | 1330496.00 | 44.40 | 0.00 | 0.00 | true | 0.444604;0.443686;0.443939;0.445874;0.443623 | 0;0;0;0;0 | 1186400;1186080;1185888;1188192;1186112 | 1323360;1341376;1335424;1332704;1314816 |
94 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 812.667 | 40943616 | 156549120 | 105957888 | GPU_0_bfc | 115605504 | 0 | 0 | 0 | compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 4.00 | 0 | 192.00 | 14506.67 | 2.60 | 0.00 | 0.00 | true | 0.026288;0.025685;0.025579;0.025600;0.025688 | 0;0;0;0;0 | 192;192;192;192;192 | 11904;14080;13184;24448;16256 |
95 | InceptionV1/InceptionV1/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 45 | 6422528 | 0 | 93112832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.67 | 1605632 | 6423434.67 | 6436085.33 | 45.50 | 0.12 | 77.69 | true | 0.458077;0.455614;0.454271;0.455273;0.451168 | 1605632;1605632;1605632;1605632;1605632 | 6434368;6443040;6436864;6432672;6437024 | 6423360;6423712;6423616;6423328;6423296 |
96 | InceptionV1/InceptionV1/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 256 14 14]] | 85.667 | 40943616 | 0 | 93112832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 68.33 | 6422528 | 25691349.33 | 25677344.00 | 48.20 | 0.13 | 93.99 | true | 0.482148;0.481475;0.481746;0.481853;0.482017 | 6422528;6422528;6422528;6422528;6422528 | 25691328;25691328;25691392;25691392;25691328 | 25694976;25661344;25671200;25670432;25690400 |
97 | InceptionV1/InceptionV1/Mixed_4d/concat | ConcatV2 | [[128 512 14 14]] | 199 | 67436544 | 67436544 | 160549376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 35.00 | 0 | 10704373.33 | 10734234.67 | 94.30 | 0.00 | 0.00 | true | 0.948102;0.967507;0.937284;0.931769;0.952570;0.964775;0.932433;0.935929;0.945434;0.961999;0.934676;0.928580;0.948976;0.962326;0.933296;0.929760;0.949384;0.963514;0.936859;0.925655 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690528;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688 | 12910944;25692768;6415744;6400128;12937728;25704800;6401632;6414592;12916960;25731840;6344800;6439168;12929888;25715168;6362080;6413952;12922848;25687904;6384384;6419456 |
97 | InceptionV1/InceptionV1/Mixed_4d/concat | ConcatV2 | [[128 512 14 14]] | 199 | 67436544 | 67436544 | 160549376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 32.09 | 0 | 10704373.33 | 10734234.67 | 94.30 | 0.00 | 0.00 | true | 0.948102;0.967507;0.937284;0.931769;0.952570;0.964775;0.932433;0.935929;0.945434;0.961999;0.934676;0.928580;0.948976;0.962326;0.933296;0.929760;0.949384;0.963514;0.936859;0.925655 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690528;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688 | 12910944;25692768;6415744;6400128;12937728;25704800;6401632;6414592;12916960;25731840;6344800;6439168;12929888;25715168;6362080;6413952;12922848;25687904;6384384;6419456 |
97 | InceptionV1/InceptionV1/Mixed_4d/concat | ConcatV2 | [[128 512 14 14]] | 199 | 67436544 | 67436544 | 160549376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 30.55 | 0 | 10704373.33 | 10734234.67 | 94.30 | 0.00 | 0.00 | true | 0.948102;0.967507;0.937284;0.931769;0.952570;0.964775;0.932433;0.935929;0.945434;0.961999;0.934676;0.928580;0.948976;0.962326;0.933296;0.929760;0.949384;0.963514;0.936859;0.925655 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690528;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688 | 12910944;25692768;6415744;6400128;12937728;25704800;6401632;6414592;12916960;25731840;6344800;6439168;12929888;25715168;6362080;6413952;12922848;25687904;6384384;6419456 |
97 | InceptionV1/InceptionV1/Mixed_4d/concat | ConcatV2 | [[128 512 14 14]] | 199 | 67436544 | 67436544 | 160549376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 30.55 | 0 | 10704373.33 | 10734234.67 | 94.30 | 0.00 | 0.00 | true | 0.948102;0.967507;0.937284;0.931769;0.952570;0.964775;0.932433;0.935929;0.945434;0.961999;0.934676;0.928580;0.948976;0.962326;0.933296;0.929760;0.949384;0.963514;0.936859;0.925655 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690528;6422688;6422688;12845216;25690272;6422688;6422688;12845216;25690272;6422688;6422688 | 12910944;25692768;6415744;6400128;12937728;25704800;6401632;6414592;12916960;25731840;6344800;6439168;12929888;25715168;6362080;6413952;12922848;25687904;6384384;6419456 |
98 | InceptionV1/InceptionV1/Mixed_4d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 512 14 14]] | 148 | 67436544 | 0 | 93915648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 129.33 | 0 | 51380341.33 | 51293354.67 | 96.40 | 0.00 | 0.00 | true | 0.962716;0.965911;0.962961;0.964054;0.965068 | 0;0;0;0;0 | 51380384;51380384;51380128;51380384;51380256 | 51290848;51279712;51314688;51300032;51289184 |
99 | InceptionV1/InceptionV1/Mixed_4e/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 512 14 14]] | 350.667 | 51380224 | 51380224 | 145295872 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 317.67 | 12845056 | 51383232.00 | 62060586.67 | 61.40 | 0.11 | 40.44 | true | 0.614170;0.614123;0.613930;0.614212;0.613785 | 12845056;12845056;12845056;12845056;12845056 | 62027904;62070976;62011904;62082880;62107744 | 51383232;51383232;51383232;51383232;51383232 |
100 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 14 14]] | 261.333 | 3211264 | 3278080 | 148507136 | GPU_0_bfc | 66816 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 163.67 | 1647378432 | 52813845.33 | 4119957.33 | 15.80 | 28.93 | 10065.43 | false | 0.158114;0.157824;0.158303;0.157988;0.157596 | 1647378432;1647378432;1647378432;1647378432;1647378432 | 52502112;53312800;52786688;53152736;51932224 | 4108928;4123136;4139776;4110112;4126624 |
100 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 14 14]] | 261.333 | 3211264 | 3278080 | 148507136 | GPU_0_bfc | 66816 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 65728.00 | 41333.33 | 44.00 | 0.00 | 0.00 | true | 0.440413;0.438563;0.441828;0.442084;0.431810 | 0;0;0;0;0 | 65728;65728;65728;70848;65728 | 40032;41984;42336;41792;40224 |
100 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 14 14]] | 261.333 | 3211264 | 3278080 | 148507136 | GPU_0_bfc | 66816 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 650.67 | 5.90 | 0.00 | 0.00 | true | 0.059498;0.059484;0.059456;0.059468;0.059321 | 0;0;0;0;0 | 96;96;96;96;96 | 416;608;960;800;544 |
101 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 144 14 14]] | 527.667 | 14450688 | 14746880 | 162957824 | GPU_0_bfc | 296192 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 435.67 | 4942135296 | 128569184.00 | 12996512.00 | 23.70 | 34.91 | 11343.84 | false | 0.237975;0.238954;0.236487;0.236297;0.236318 | 4942135296;4942135296;4942135296;4942135296;4942135296 | 131806560;126544000;122200480;135346336;127356992 | 13545120;12586240;12543296;13579072;12858176 |
101 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 144 14 14]] | 527.667 | 14450688 | 14746880 | 162957824 | GPU_0_bfc | 296192 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 295104.00 | 10.67 | 46.00 | 0.00 | 0.00 | true | 0.456691;0.459406;0.461169;0.460375;0.460194 | 0;0;0;0;0 | 295104;295104;295104;295104;295104 | 128;0;0;0;32 |
101 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 144 14 14]] | 527.667 | 14450688 | 14746880 | 162957824 | GPU_0_bfc | 296192 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 0.00 | 5.80 | 0.00 | 0.00 | true | 0.057966;0.057969;0.057955;0.057948;0.057982 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;0;0;0 |
102 | InceptionV1/InceptionV1/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 112 14 14]] | 511 | 11239424 | 11470080 | 174197248 | GPU_0_bfc | 230656 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 411.00 | 3294756864 | 102898112.00 | 13555072.00 | 22.70 | 28.29 | 8016.44 | false | 0.227392;0.227701;0.226997;0.229643;0.226035 | 3294756864;3294756864;3294756864;3294756864;3294756864 | 13508480;13619776;11690688;13633792;13536960 | 102900896;105117888;88258304;103986624;101806816 |
102 | InceptionV1/InceptionV1/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 112 14 14]] | 511 | 11239424 | 11470080 | 174197248 | GPU_0_bfc | 230656 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 230709.33 | 120842.67 | 45.20 | 0.00 | 0.00 | true | 0.449555;0.452275;0.452413;0.455950;0.450732 | 0;0;0;0;0 | 230048;230144;230048;235072;231936 | 123488;129536;124640;108160;114400 |
102 | InceptionV1/InceptionV1/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 112 14 14]] | 511 | 11239424 | 11470080 | 174197248 | GPU_0_bfc | 230656 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 128.00 | 1237.33 | 5.80 | 0.00 | 0.00 | true | 0.057987;0.058070;0.057968;0.057971;0.057971 | 0;0;0;0;0 | 4192;160;96;96;128 | 2336;1504;1280;928;832 |
103 | InceptionV1/InceptionV1/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 257.333 | 6422528 | 6554880 | 113183232 | GPU_0_bfc | 132352 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 163.00 | 1647378432 | 52394581.33 | 5278336.00 | 15.80 | 28.56 | 10106.62 | false | 0.158449;0.158486;0.158445;0.158549;0.158437 | 1647378432;1647378432;1647378432;1647378432;1647378432 | 52051104;53036064;52851904;52183008;52148832 | 5384896;5254688;5296896;5240064;5283424 |
103 | InceptionV1/InceptionV1/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 257.333 | 6422528 | 6554880 | 113183232 | GPU_0_bfc | 132352 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 131264.00 | 13898.67 | 44.00 | 0.00 | 0.00 | true | 0.453082;0.433576;0.444127;0.434552;0.442569 | 0;0;0;0;0 | 12384;11552;28544;9824;17760 | 131264;131264;131328;131264;131264 |
103 | InceptionV1/InceptionV1/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 257.333 | 6422528 | 6554880 | 113183232 | GPU_0_bfc | 132352 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 682.67 | 5.80 | 0.00 | 0.00 | true | 0.057980;0.057982;0.058003;0.057969;0.057968 | 0;0;0;0;0 | 96;96;96;96;96 | 1536;128;1152;384;512 |
104 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 32 14 14]] | 36.667 | 3211264 | 0 | 61803008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 13.33 | 802816 | 3213525.33 | 3144810.67 | 44.20 | 0.13 | 60.21 | true | 0.449786;0.441509;0.440955;0.442681;0.440203 | 802816;802816;802816;802816;802816 | 3213344;3214336;3213760;3213248;3213472 | 3145888;3136480;3146528;3142016;3147264 |
105 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 144 14 14]] | 58.667 | 14450688 | 0 | 61803008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 41.00 | 3612672 | 14453098.67 | 14606602.67 | 46.70 | 0.12 | 88.11 | true | 0.469181;0.467964;0.466864;0.465184;0.467443 | 3612672;3612672;3612672;3612672;3612672 | 14605792;14614336;14607040;14606048;14606720 | 14453184;14453536;14453344;14452576;14452768 |
106 | InceptionV1/InceptionV1/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 112 14 14]] | 50.667 | 11239424 | 0 | 61803008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 33.33 | 2809856 | 11240128.00 | 11235872.00 | 45.90 | 0.13 | 84.30 | true | 0.460849;0.458928;0.458299;0.458747;0.459483 | 2809856;2809856;2809856;2809856;2809856 | 11240128;11240128;11240128;11240128;11240128 | 11231392;11245472;11236256;11218976;11239968 |
107 | InceptionV1/InceptionV1/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 37.667 | 6422528 | 0 | 61803008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.00 | 1605632 | 6423040.00 | 6396736.00 | 45.20 | 0.13 | 80.28 | true | 0.452707;0.451825;0.453602;0.450696;0.450483 | 1605632;1605632;1605632;1605632;1605632 | 6423040;6423040;6423040;6423040;6423040 | 6410272;6391936;6395168;6403104;6388896 |
108 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 32 14 14]] | 29 | 3211264 | 0 | 61803008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 11.00 | 0 | 3211360.00 | 2995733.33 | 73.70 | 0.00 | 0.00 | true | 0.736632;0.738669;0.729769;0.741504;0.734308 | 0;0;0;0;0 | 3211360;3211360;3211360;3211616;3211360 | 3044736;3063200;3014208;2928256;2907712 |
109 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 144 14 14]] | 54.667 | 14450688 | 0 | 61803008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 38.00 | 0 | 14450848.00 | 14568896.00 | 91.70 | 0.00 | 0.00 | true | 0.918366;0.916243;0.920756;0.912266;0.917526 | 0;0;0;0;0 | 14531904;14527488;14534848;14639936;14684704 | 14450848;14457504;14450848;14450848;14450848 |
110 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 171.333 | 6422528 | 6701312 | 68225536 | GPU_0_bfc | 278784 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 69.00 | 579862528 | 3967850.67 | 7700736.00 | 23.30 | 49.69 | 8403.80 | false | 0.233935;0.234920;0.231906;0.233190;0.232424 | 579862528;579862528;579862528;579862528;579862528 | 3999296;3962720;3946176;3969952;3970880 | 7692512;7663104;7713632;7701408;7708288 |
110 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 171.333 | 6422528 | 6701312 | 68225536 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 73920.00 | 154709.33 | 43.00 | 0.00 | 0.00 | true | 0.427940;0.430080;0.430347;0.430639;0.430974 | 0;0;0;0;0 | 73920;79296;73920;73920;73920 | 146304;166528;162048;153088;148992 |
110 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 64 14 14]] | 171.333 | 6422528 | 6701312 | 68225536 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 118784 | 1728.00 | 184746.67 | 6.20 | 0.64 | 23.76 | true | 0.062348;0.062362;0.062351;0.062349;0.062347 | 118784;118784;118784;118784;118784 | 1728;1728;1728;1728;1728 | 184448;170880;190720;187648;182144 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | volta_cgemm_32x32_tn | 508.67 | 6147145728 | 37136832.00 | 21556448.00 | 23.60 | 104.73 | 12084.81 | false | 0.236310;0.235974;0.234881;0.236524;0.236000 | 6147145728;6147145728;6147145728;6147145728;6147145728 | 38489792;36518464;37151936;37740096;35343744 | 22890240;21605184;21547904;21516256;20881728 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 110.00 | 170311680 | 42697888.00 | 31708970.67 | 70.00 | 2.29 | 1548.29 | true | 0.700034;0.699401;0.701080;0.699441;0.697775 | 170311680;170311680;170311680;170311680;170311680 | 42740608;42684640;42689856;42719168;42655264 | 31717728;31672384;31703808;31794848;31705376 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | void flip_filter<float, float>(float*, float const*, int, int, int, int) | 71.00 | 0 | 18357.33 | 1385568.00 | 3.90 | 0.00 | 0.00 | true | 0.039076;0.039927;0.039191;0.039164;0.039127 | 0;0;0;0;0 | 18976;16192;18528;22080;17568 | 1375296;1410752;1379616;1386848;1390240 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 64.29 | 132028416 | 8059189.33 | 34211061.33 | 67.20 | 3.12 | 2053.77 | true | 0.683919;0.661305;0.684322;0.653561;0.683686;0.657778;0.684183;0.653605;0.683958;0.661504 | 182808576;81248256;182808576;81248256;182808576;81248256;182808576;81248256;182808576;81248256 | 47147328;21280960;47121888;21245792;47144640;21291488;47141184;21288320;47142528;21274528 | 1006208;15304704;1007104;15108288;1011264;15163328;1000352;15058944;999808;15184704 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 61.86 | 132028416 | 8059189.33 | 34211061.33 | 67.20 | 3.12 | 2134.41 | true | 0.683919;0.661305;0.684322;0.653561;0.683686;0.657778;0.684183;0.653605;0.683958;0.661504 | 182808576;81248256;182808576;81248256;182808576;81248256;182808576;81248256;182808576;81248256 | 1006208;15304704;1007104;15108288;1011264;15163328;1000352;15058944;999808;15184704 | 47147328;21280960;47121888;21245792;47144640;21291488;47141184;21288320;47142528;21274528 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1511157.33 | 1571317.33 | 45.10 | 0.00 | 0.00 | true | 0.453794;0.448532;0.447588;0.450576;0.455729 | 0;0;0;0;0 | 1580544;1569472;1584832;1563936;1544096 | 1511936;1509472;1511424;1511968;1510112 |
111 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 288 14 14]] | 954.667 | 28901376 | 169906176 | 93915648 | GPU_0_bfc | 141004800 | 0 | 0 | 0 | compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 3.67 | 0 | 192.00 | 13568.00 | 2.60 | 0.00 | 0.00 | true | 0.025578;0.025578;0.025586;0.025581;0.025618 | 0;0;0;0;0 | 192;192;192;192;192 | 13440;13952;28672;12160;13312 |
112 | InceptionV1/InceptionV1/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 64 14 14]] | 46.667 | 6422528 | 0 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.33 | 1605632 | 6423712.00 | 6396992.00 | 45.50 | 0.13 | 78.97 | true | 0.456964;0.456112;0.455264;0.454289;0.449577 | 1605632;1605632;1605632;1605632;1605632 | 6400128;6393728;6386944;6397120;6428704 | 6423840;6423744;6424640;6423328;6423552 |
113 | InceptionV1/InceptionV1/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 288 14 14]] | 94.667 | 28901376 | 0 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 76.00 | 7225344 | 28902762.67 | 28918944.00 | 47.80 | 0.12 | 95.07 | true | 0.477594;0.477906;0.478046;0.476068;0.477818 | 7225344;7225344;7225344;7225344;7225344 | 28902784;28902720;28902720;28902784;28902784 | 28900512;28928544;28933024;28921632;28906656 |
114 | InceptionV1/InceptionV1/Mixed_4e/concat | ConcatV2 | [[128 528 14 14]] | 204 | 94732288 | 94732288 | 174197248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 35.45 | 0 | 10303413.33 | 10326293.33 | 94.30 | 0.00 | 0.00 | true | 0.947337;0.967538;0.933272;0.934414;0.947694;0.968859;0.933154;0.933257;0.954140;0.966905;0.925960;0.933013;0.950797;0.967224;0.939288;0.932882;0.945209;0.969571;0.931613;0.927717 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6428064 | 11333120;28915200;6361056;6405216;11299840;28923488;6387296;6402560;11292896;28946400;6355040;6425056;11320928;28925280;6386784;6392160;11309280;28919040;6359264;6431968 |
114 | InceptionV1/InceptionV1/Mixed_4e/concat | ConcatV2 | [[128 528 14 14]] | 204 | 94732288 | 94732288 | 174197248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 31.45 | 0 | 10303413.33 | 10326293.33 | 94.30 | 0.00 | 0.00 | true | 0.947337;0.967538;0.933272;0.934414;0.947694;0.968859;0.933154;0.933257;0.954140;0.966905;0.925960;0.933013;0.950797;0.967224;0.939288;0.932882;0.945209;0.969571;0.931613;0.927717 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6428064 | 11333120;28915200;6361056;6405216;11299840;28923488;6387296;6402560;11292896;28946400;6355040;6425056;11320928;28925280;6386784;6392160;11309280;28919040;6359264;6431968 |
114 | InceptionV1/InceptionV1/Mixed_4e/concat | ConcatV2 | [[128 528 14 14]] | 204 | 94732288 | 94732288 | 174197248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 30.27 | 0 | 10303413.33 | 10326293.33 | 94.30 | 0.00 | 0.00 | true | 0.947337;0.967538;0.933272;0.934414;0.947694;0.968859;0.933154;0.933257;0.954140;0.966905;0.925960;0.933013;0.950797;0.967224;0.939288;0.932882;0.945209;0.969571;0.931613;0.927717 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6428064 | 11333120;28915200;6361056;6405216;11299840;28923488;6387296;6402560;11292896;28946400;6355040;6425056;11320928;28925280;6386784;6392160;11309280;28919040;6359264;6431968 |
114 | InceptionV1/InceptionV1/Mixed_4e/concat | ConcatV2 | [[128 528 14 14]] | 204 | 94732288 | 94732288 | 174197248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 30.27 | 0 | 10303413.33 | 10326293.33 | 94.30 | 0.00 | 0.00 | true | 0.947337;0.967538;0.933272;0.934414;0.947694;0.968859;0.933154;0.933257;0.954140;0.966905;0.925960;0.933013;0.950797;0.967224;0.939288;0.932882;0.945209;0.969571;0.931613;0.927717 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 11333120;28915200;6361056;6405216;11299840;28923488;6387296;6402560;11292896;28946400;6355040;6425056;11320928;28925280;6386784;6392160;11309280;28919040;6359264;6431968 | 11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6422688;11239584;28901536;6422688;6428064 |
115 | InceptionV1/InceptionV1/Mixed_4e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 528 14 14]] | 153.667 | 94732288 | 0 | 121211392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 133.00 | 0 | 52985973.33 | 52912938.67 | 96.30 | 0.00 | 0.00 | true | 0.964000;0.965934;0.963344;0.961847;0.962485 | 0;0;0;0;0 | 52916096;52924256;52915200;52907520;52883456 | 52985952;52988000;52985952;52985952;52986016 |
116 | InceptionV1/InceptionV1/Mixed_4f/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 528 14 14]] | 361 | 52985856 | 52985856 | 174197248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 326.00 | 13246464 | 52988864.00 | 64136384.00 | 61.40 | 0.11 | 40.63 | true | 0.614092;0.614259;0.614352;0.614009;0.614225 | 13246464;13246464;13246464;13246464;13246464 | 52988864;52988864;52988864;52988864;52988864 | 64159520;64113472;64136160;64092288;64207200 |
117 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 14 14]] | 269.667 | 3211264 | 3280128 | 177408512 | GPU_0_bfc | 68864 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 168.00 | 1698758656 | 54415477.33 | 4105418.67 | 15.80 | 29.03 | 10111.66 | false | 0.158403;0.158251;0.158343;0.158421;0.158362 | 1698758656;1698758656;1698758656;1698758656;1698758656 | 54607936;54753504;54784224;53781952;53884992 | 4111136;4096160;4090848;4133440;4108960 |
117 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 14 14]] | 269.667 | 3211264 | 3280128 | 177408512 | GPU_0_bfc | 68864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 67776.00 | 65536.00 | 43.00 | 0.00 | 0.00 | true | 0.430544;0.427316;0.429406;0.428753;0.433783 | 0;0;0;0;0 | 68960;65216;62368;65312;66080 | 67776;67776;67776;67776;67776 |
117 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 14 14]] | 269.667 | 3211264 | 3280128 | 177408512 | GPU_0_bfc | 68864 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 725.33 | 5.90 | 0.00 | 0.00 | true | 0.059449;0.059468;0.059452;0.059456;0.059616 | 0;0;0;0;0 | 544;672;928;864;640 | 96;96;96;96;96 |
118 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 14 14]] | 510.667 | 16056320 | 16395520 | 193464832 | GPU_0_bfc | 339200 | 0 | 0 | 0 | volta_scudnn_128x32_relu_interior_nn_v1 | 402.33 | 4246896640 | 173895904.00 | 17446709.33 | 20.50 | 22.20 | 10555.68 | false | 0.204198;0.203784;0.206040;0.205303;0.204402 | 4246896640;4246896640;4246896640;4246896640;4246896640 | 17426240;17565440;17637824;16977760;17348448 | 177458624;181479008;172450048;168681440;171779040 |
118 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 14 14]] | 510.667 | 16056320 | 16395520 | 193464832 | GPU_0_bfc | 339200 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 338112.00 | 0.00 | 46.10 | 0.00 | 0.00 | true | 0.462584;0.460795;0.460692;0.460288;0.459606 | 0;0;0;0;0 | 0;0;0;0;0 | 338112;338112;338112;338112;338112 |
118 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 14 14]] | 510.667 | 16056320 | 16395520 | 193464832 | GPU_0_bfc | 339200 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 0.00 | 5.80 | 0.00 | 0.00 | true | 0.058324;0.057969;0.057959;0.057982;0.057975 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;0;0;0 |
119 | InceptionV1/InceptionV1/Mixed_4f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 769.333 | 25690112 | 26232064 | 219154944 | GPU_0_bfc | 541952 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 668.33 | 6795034624 | 21.33 | 2197.33 | 23.20 | 3062666.77 | 10167.14 | false | 0.230036;0.227970;0.234722;0.236253;0.231345 | 6795034624;6795034624;6795034624;6795034624;6795034624 | 192;64;0;0;0 | 1824;3072;1824;2144;2624 |
119 | InceptionV1/InceptionV1/Mixed_4f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 769.333 | 25690112 | 26232064 | 219154944 | GPU_0_bfc | 541952 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 542752.00 | 274250.67 | 45.90 | 0.00 | 0.00 | true | 0.455800;0.457997;0.459732;0.459458;0.458267 | 0;0;0;0;0 | 542656;543360;542240;547904;542208 | 323168;238688;197248;260896;337664 |
119 | InceptionV1/InceptionV1/Mixed_4f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 256 14 14]] | 769.333 | 25690112 | 26232064 | 219154944 | GPU_0_bfc | 541952 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 117.33 | 1205.33 | 5.80 | 0.00 | 0.00 | true | 0.057976;0.057994;0.057989;0.057990;0.057963 | 0;0;0;0;0 | 128;128;96;96;128 | 4672;2080;672;864;640 |
120 | InceptionV1/InceptionV1/Mixed_4f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 469.333 | 12845056 | 13116672 | 137267712 | GPU_0_bfc | 271616 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 339.67 | 3397517312 | 106045909.33 | 14557002.67 | 21.70 | 28.17 | 10002.49 | false | 0.217116;0.217279;0.219607;0.216231;0.216341 | 3397517312;3397517312;3397517312;3397517312;3397517312 | 105811040;106210592;106116096;105028576;106400544 | 14719104;14678496;14528896;14463616;14141184 |
120 | InceptionV1/InceptionV1/Mixed_4f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 469.333 | 12845056 | 13116672 | 137267712 | GPU_0_bfc | 271616 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 272736.00 | 128416.00 | 45.00 | 0.00 | 0.00 | true | 0.448621;0.449290;0.449650;0.451706;0.450303 | 0;0;0;0;0 | 272224;273440;272544;271040;273536 | 124800;152160;108288;52160;192800 |
120 | InceptionV1/InceptionV1/Mixed_4f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 469.333 | 12845056 | 13116672 | 137267712 | GPU_0_bfc | 271616 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 672.00 | 5.80 | 0.00 | 0.00 | true | 0.057966;0.057989;0.057978;0.057985;0.057968 | 0;0;0;0;0 | 96;96;96;96;96 | 576;384;704;3168;736 |
121 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 32 14 14]] | 36.333 | 3211264 | 0 | 84281856 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 11.33 | 802816 | 3213013.33 | 1947456.00 | 44.30 | 0.16 | 70.84 | true | 0.442478;0.441101;0.443880;0.441658;0.448865 | 802816;802816;802816;802816;802816 | 3213248;3212928;3212448;3212928;3213184 | 1828320;1971936;1979392;1923872;1946560 |
122 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 160 14 14]] | 64.333 | 16056320 | 0 | 84281856 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 46.00 | 4014080 | 16060074.67 | 16183285.33 | 47.10 | 0.12 | 87.26 | true | 0.471566;0.470129;0.471539;0.469275;0.470055 | 4014080;4014080;4014080;4014080;4014080 | 16061344;16058304;16059616;16059264;16061472 | 16168704;16184256;16209344;16192160;16173440 |
123 | InceptionV1/InceptionV1/Mixed_4f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 256 14 14]] | 85.667 | 25690112 | 0 | 84281856 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 68.33 | 6422528 | 25691392.00 | 25688266.67 | 48.20 | 0.13 | 93.99 | true | 0.481882;0.482045;0.481813;0.481642;0.481215 | 6422528;6422528;6422528;6422528;6422528 | 25691392;25691392;25691392;25691392;25693184 | 25701760;25690272;25672352;25677856;25696672 |
124 | InceptionV1/InceptionV1/Mixed_4f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 14 14]] | 55.333 | 12845056 | 0 | 84281856 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 36.00 | 3211264 | 12845824.00 | 12834112.00 | 47.00 | 0.13 | 89.20 | true | 0.471342;0.469593;0.471027;0.468579;0.470040 | 3211264;3211264;3211264;3211264;3211264 | 12845824;12845824;12845824;12845824;12845824 | 12838656;12833568;12834336;12833440;12834432 |
125 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 32 14 14]] | 29 | 3211264 | 0 | 84281856 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 10.67 | 0 | 3211424.00 | 2921920.00 | 75.60 | 0.00 | 0.00 | true | 0.752918;0.751752;0.772138;0.755593;0.759734 | 0;0;0;0;0 | 3211424;3211424;3211424;3211424;3211424 | 2894784;2998624;2897184;2937856;2930720 |
126 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 160 14 14]] | 59.333 | 16056320 | 0 | 84281856 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 43.00 | 0 | 16056522.67 | 16269056.00 | 90.60 | 0.00 | 0.00 | true | 0.901690;0.914506;0.906267;0.909241;0.902100 | 0;0;0;0;0 | 16056416;16056416;16063072;16056736;16056416 | 16327136;16206240;16289184;16248576;16269408 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | volta_cgemm_32x32_tn | 65.00 | 618135552 | 7512885.33 | 19521482.67 | 22.60 | 22.86 | 9509.78 | false | 0.226134;0.227421;0.226088;0.225498;0.225436 | 618135552;618135552;618135552;618135552;618135552 | 7534880;7527712;7498976;7498464;7511968 | 19526688;19534656;19503104;19458400;19550624 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 58.00 | 75694080 | 18073418.67 | 14357952.00 | 66.40 | 2.33 | 1305.07 | true | 0.662523;0.664805;0.661336;0.664070;0.668185 | 75694080;75694080;75694080;75694080;75694080 | 18071520;18076416;18072320;18082400;18062272 | 14357792;14336544;14360288;14366592;14355776 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 14.57 | 18055168 | 1607392.00 | 4201802.67 | 32.10 | 3.11 | 1239.12 | true | 0.302055;0.341096;0.306293;0.338718;0.302259;0.337707;0.302320;0.337784;0.303948;0.338508 | 18055168;18055168;18055168;18055168;18055168;18055168;18055168;18055168;18055168;18055168 | 3996256;4400064;3983680;4421728;3973792;4452576;3973664;4490208;3950240;4435296 | 2880;3212672;2816;3211968;3072;3211904;2816;3211712;2816;3218944 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 14.00 | 18055168 | 1607392.00 | 4201802.67 | 32.10 | 3.11 | 1289.65 | true | 0.302055;0.341096;0.306293;0.338718;0.302259;0.337707;0.302320;0.337784;0.303948;0.338508 | 18055168;18055168;18055168;18055168;18055168;18055168;18055168;18055168;18055168;18055168 | 2880;3212672;2816;3211968;3072;3211904;2816;3211712;2816;3218944 | 3996256;4400064;3983680;4421728;3973792;4452576;3973664;4490208;3950240;4435296 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | void flip_filter<float, float>(float*, float const*, int, int, int, int) | 10.00 | 0 | 1216.00 | 420522.67 | 4.30 | 0.00 | 0.00 | true | 0.042573;0.043006;0.042672;0.042960;0.043334 | 0;0;0;0;0 | 1216;1216;1216;1216;1216 | 393600;420096;422016;422144;419456 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 147648.00 | 327370.67 | 43.00 | 0.00 | 0.00 | true | 0.429173;0.430015;0.430391;0.430910;0.429933 | 0;0;0;0;0 | 148160;147648;147648;147648;147648 | 320672;324128;317216;339872;337312 |
127 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 14 14]] | 290 | 12845056 | 50888704 | 97126912 | GPU_0_bfc | 38043648 | 0 | 0 | 0 | compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 3.67 | 0 | 192.00 | 16000.00 | 2.60 | 0.00 | 0.00 | true | 0.026176;0.025636;0.025574;0.025570;0.025590 | 0;0;0;0;0 | 192;192;192;3520;192 | 14720;16768;16512;24320;13184 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | volta_cgemm_32x32_tn | 617.67 | 7585136640 | 0.00 | 1184.00 | 23.70 | 6406365.41 | 12280.30 | false | 0.237639;0.236667;0.237222;0.237333;0.237008 | 7585136640;7585136640;7585136640;7585136640;7585136640 | 1336704;0;0;0;0 | 1440;1184;1184;928;1184 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 124.00 | 189235200 | 47501706.67 | 35232021.33 | 70.40 | 2.29 | 1526.09 | true | 0.703682;0.704945;0.702499;0.704313;0.703385 | 189235200;189235200;189235200;189235200;189235200 | 35252832;35201664;35210560;35232672;35267520 | 47531360;47498080;47521632;47485408;47480096 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | void flip_filter<float, float>(float*, float const*, int, int, int, int) | 87.00 | 0 | 233930.67 | 1763381.33 | 4.20 | 0.00 | 0.00 | true | 0.041577;0.041523;0.041427;0.041497;0.041625 | 0;0;0;0;0 | 233888;235744;234720;232480;233184 | 1755808;1768320;1760672;1765856;1763616 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 76.29 | 157982720 | 9086634.67 | 41028202.67 | 68.10 | 3.15 | 2070.93 | true | 0.694676;0.666982;0.693953;0.667850;0.694043;0.670292;0.692836;0.663475;0.692530;0.666597 | 225689600;90275840;225689600;90275840;225689600;90275840;225689600;90275840;225689600;90275840 | 1365376;17070464;1358208;16842240;1360256;16817408;1366400;16806528;1362880;16801216 | 58477952;23585248;58430016;23574496;58433248;23607392;58456448;23566048;58421056;23692256 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 71.43 | 157982720 | 9086634.67 | 41028202.67 | 68.10 | 3.15 | 2211.74 | true | 0.694676;0.666982;0.693953;0.667850;0.694043;0.670292;0.692836;0.663475;0.692530;0.666597 | 225689600;90275840;225689600;90275840;225689600;90275840;225689600;90275840;225689600;90275840 | 1365376;17070464;1358208;16842240;1360256;16817408;1366400;16806528;1362880;16801216 | 58477952;23585248;58430016;23574496;58433248;23607392;58456448;23566048;58421056;23692256 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.67 | 0 | 1843893.33 | 2367029.33 | 45.10 | 0.00 | 0.00 | true | 0.448910;0.453812;0.453661;0.450225;0.449688 | 0;0;0;0;0 | 2366944;2372736;2378144;2358176;2361408 | 1843872;1843776;1843680;1844032;1844032 |
128 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 14 14]] | 1115.333 | 32112640 | 202317824 | 126028288 | GPU_0_bfc | 170205184 | 0 | 0 | 0 | compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 3.33 | 0 | 192.00 | 12192.00 | 2.60 | 0.00 | 0.00 | true | 0.026043;0.025576;0.026096;0.025963;0.025608 | 0;0;0;0;0 | 192;192;192;192;192 | 12032;12672;11776;12416;12128 |
129 | InceptionV1/InceptionV1/Mixed_4f/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 128 14 14]] | 60.667 | 12845056 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 36.00 | 3211264 | 12846016.00 | 12817653.33 | 47.00 | 0.13 | 89.20 | true | 0.471559;0.470917;0.469733;0.469715;0.470544 | 3211264;3211264;3211264;3211264;3211264 | 12811808;12821312;12804256;12837664;12819840 | 12846432;12845952;12846016;12846080;12845856 |
130 | InceptionV1/InceptionV1/Mixed_4f/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 320 14 14]] | 102 | 32112640 | 0 | 109971968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 84.67 | 8028160 | 32114176.00 | 32143605.33 | 48.20 | 0.12 | 94.82 | true | 0.482864;0.480550;0.481053;0.481876;0.482253 | 8028160;8028160;8028160;8028160;8028160 | 32114176;32114176;32114176;32114176;32114176 | 32134944;32147104;32151584;32124800;32148768 |
131 | InceptionV1/InceptionV1/Mixed_4f/concat | ConcatV2 | [[128 832 14 14]] | 284.667 | 112394240 | 112394240 | 222366208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 57.36 | 0 | 19803104.00 | 19851757.33 | 95.80 | 0.00 | 0.00 | true | 0.964009;0.970047;0.948230;0.954254;0.964904;0.972766;0.950064;0.950715;0.963326;0.971282;0.948251;0.948413;0.962665;0.970882;0.944827;0.948110;0.965158;0.970512;0.948581;0.947110 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25690272;32112800;12845216;12845216;25690272;32113056;12845216;12845216;25690272;32112800;12845216;12845216;25690272;32112800;12845216;12845216;25692064;32112800;12845216;12845216 | 25767040;32122752;12804576;12898016;25765216;32121952;12787712;12888192;25802848;32094208;12821728;12860416;25781600;32124256;12816128;12889856;25808224;32066784;12840928;12851968 |
131 | InceptionV1/InceptionV1/Mixed_4f/concat | ConcatV2 | [[128 832 14 14]] | 284.667 | 112394240 | 112394240 | 222366208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 56.27 | 0 | 19803104.00 | 19851757.33 | 95.80 | 0.00 | 0.00 | true | 0.964009;0.970047;0.948230;0.954254;0.964904;0.972766;0.950064;0.950715;0.963326;0.971282;0.948251;0.948413;0.962665;0.970882;0.944827;0.948110;0.965158;0.970512;0.948581;0.947110 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25690272;32112800;12845216;12845216;25690272;32113056;12845216;12845216;25690272;32112800;12845216;12845216;25690272;32112800;12845216;12845216;25692064;32112800;12845216;12845216 | 25767040;32122752;12804576;12898016;25765216;32121952;12787712;12888192;25802848;32094208;12821728;12860416;25781600;32124256;12816128;12889856;25808224;32066784;12840928;12851968 |
131 | InceptionV1/InceptionV1/Mixed_4f/concat | ConcatV2 | [[128 832 14 14]] | 284.667 | 112394240 | 112394240 | 222366208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 53.09 | 0 | 19803104.00 | 19851757.33 | 95.80 | 0.00 | 0.00 | true | 0.964009;0.970047;0.948230;0.954254;0.964904;0.972766;0.950064;0.950715;0.963326;0.971282;0.948251;0.948413;0.962665;0.970882;0.944827;0.948110;0.965158;0.970512;0.948581;0.947110 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25690272;32112800;12845216;12845216;25690272;32113056;12845216;12845216;25690272;32112800;12845216;12845216;25690272;32112800;12845216;12845216;25692064;32112800;12845216;12845216 | 25767040;32122752;12804576;12898016;25765216;32121952;12787712;12888192;25802848;32094208;12821728;12860416;25781600;32124256;12816128;12889856;25808224;32066784;12840928;12851968 |
131 | InceptionV1/InceptionV1/Mixed_4f/concat | ConcatV2 | [[128 832 14 14]] | 284.667 | 112394240 | 112394240 | 222366208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 53.00 | 0 | 19803104.00 | 19851757.33 | 95.80 | 0.00 | 0.00 | true | 0.964009;0.970047;0.948230;0.954254;0.964904;0.972766;0.950064;0.950715;0.963326;0.971282;0.948251;0.948413;0.962665;0.970882;0.944827;0.948110;0.965158;0.970512;0.948581;0.947110 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 25690272;32112800;12845216;12845216;25690272;32113056;12845216;12845216;25690272;32112800;12845216;12845216;25690272;32112800;12845216;12845216;25692064;32112800;12845216;12845216 | 25767040;32122752;12804576;12898016;25765216;32121952;12787712;12888192;25802848;32094208;12821728;12860416;25781600;32124256;12816128;12889856;25808224;32066784;12840928;12851968 |
132 | InceptionV1/InceptionV1/Mixed_4f/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 832 14 14]] | 227.333 | 112394240 | 0 | 138873344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 207.33 | 0 | 83493024.00 | 83369162.67 | 97.10 | 0.00 | 0.00 | true | 0.969016;0.970986;0.971807;0.971663;0.972701 | 0;0;0;0;0 | 83493024;83493024;83493024;83497120;83493024 | 83364192;83374208;83390720;83364960;83368320 |
133 | InceptionV1/InceptionV1/MaxPool_5a_2x2/MaxPool | MaxPool | [[128 832 7 7]] | 186 | 20873216 | 20873216 | 159746560 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 151.67 | 5218304 | 83505162.67 | 25816533.33 | 63.10 | 0.05 | 34.41 | true | 0.630994;0.631022;0.630183;0.631560;0.631142 | 5218304;5218304;5218304;5218304;5218304 | 25783168;25758624;25704032;25952576;25907808 | 83511808;83498816;83501984;83505568;83507936 |
134 | InceptionV1/InceptionV1/Mixed_5b/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 832 7 7]] | 145.667 | 20873216 | 20873216 | 68225536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 116.33 | 5218304 | 20873408.00 | 22559584.00 | 65.40 | 0.12 | 44.86 | true | 0.654367;0.654057;0.654125;0.653953;0.654085 | 5218304;5218304;5218304;5218304;5218304 | 20873408;20873408;20878528;20873408;20873408 | 22618720;22573664;22625536;22486368;22424704 |
135 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 7 7]] | 204.333 | 802816 | 909824 | 69028352 | GPU_0_bfc | 107008 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 106.00 | 668745728 | 20645472.00 | 2935178.67 | 6.20 | 28.36 | 6308.92 | false | 0.062496;0.062496;0.062496;0.062496;0.062496 | 668745728;668745728;668745728;668745728;668745728 | 2932064;2939968;2941312;2925984;2933504 | 20647104;20643712;20645056;20652672;20644256 |
135 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 7 7]] | 204.333 | 802816 | 909824 | 69028352 | GPU_0_bfc | 107008 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 106688.00 | 76960.00 | 43.20 | 0.00 | 0.00 | true | 0.432606;0.431494;0.431675;0.431116;0.432221 | 0;0;0;0;0 | 111808;106688;106688;106688;106688 | 82176;76224;76576;76480;77824 |
135 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 32 7 7]] | 204.333 | 802816 | 909824 | 69028352 | GPU_0_bfc | 107008 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 778.67 | 4.90 | 0.00 | 0.00 | true | 0.048855;0.048841;0.048885;0.048831;0.048808 | 0;0;0;0;0 | 352;352;352;352;352 | 928;736;544;672;1120 |
136 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 7 7]] | 273.333 | 4014080 | 4547072 | 73042432 | GPU_0_bfc | 532992 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 177.33 | 2006237184 | 24404672.00 | 3077621.33 | 11.70 | 73.00 | 11313.39 | false | 0.117438;0.116739;0.116497;0.117109;0.117093 | 2006237184;2006237184;2006237184;2006237184;2006237184 | 3070368;3150496;3119744;3042752;3033344 | 24444928;24555296;24386624;24358336;24382464 |
136 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 7 7]] | 273.333 | 4014080 | 4547072 | 73042432 | GPU_0_bfc | 532992 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 532672.00 | 0.00 | 45.40 | 0.00 | 0.00 | true | 0.454485;0.454849;0.454832;0.453715;0.453639 | 0;0;0;0;0 | 534720;532672;532672;532672;532672 | 0;0;0;0;0 |
136 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 160 7 7]] | 273.333 | 4014080 | 4547072 | 73042432 | GPU_0_bfc | 532992 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 0.00 | 4.90 | 0.00 | 0.00 | true | 0.048828;0.048821;0.048810;0.048818;0.048831 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;0;0;0 |
137 | InceptionV1/InceptionV1/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 256 7 7]] | 358.333 | 6422528 | 7275008 | 79464960 | GPU_0_bfc | 852480 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 261.00 | 2674982912 | 41546602.67 | 7902730.67 | 15.40 | 54.10 | 10248.98 | false | 0.153673;0.153721;0.154405;0.154202;0.153906 | 2674982912;2674982912;2674982912;2674982912;2674982912 | 41493344;41442720;41795488;41539840;41606624 | 7873888;7883488;7931168;7983008;7893536 |
137 | InceptionV1/InceptionV1/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 256 7 7]] | 358.333 | 6422528 | 7275008 | 79464960 | GPU_0_bfc | 852480 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.33 | 0 | 852181.33 | 938432.00 | 44.90 | 0.00 | 0.00 | true | 0.448947;0.449220;0.447886;0.451568;0.447968 | 0;0;0;0;0 | 852160;852160;852192;852192;852192 | 946688;919840;907232;948768;963040 |
137 | InceptionV1/InceptionV1/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 256 7 7]] | 358.333 | 6422528 | 7275008 | 79464960 | GPU_0_bfc | 852480 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 885.33 | 4.90 | 0.00 | 0.00 | true | 0.048862;0.048841;0.048818;0.048818;0.048862 | 0;0;0;0;0 | 96;96;96;96;96 | 864;832;960;640;992 |
138 | InceptionV1/InceptionV1/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 274 | 3211264 | 3637760 | 61803008 | GPU_0_bfc | 426496 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 178.00 | 1337491456 | 31138026.67 | 5293685.33 | 8.20 | 36.71 | 7514.00 | false | 0.082019;0.081878;0.081848;0.081869;0.081872 | 1337491456;1337491456;1337491456;1337491456;1337491456 | 31072448;31131744;31203968;31129216;31153120 | 5350080;5264608;5286848;5244384;5329600 |
138 | InceptionV1/InceptionV1/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 274 | 3211264 | 3637760 | 61803008 | GPU_0_bfc | 426496 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 426464.00 | 314464.00 | 44.90 | 0.00 | 0.00 | true | 0.451564;0.449155;0.449966;0.448876;0.448679 | 0;0;0;0;0 | 426272;431648;426336;426400;426656 | 279424;383776;290816;343808;308768 |
138 | InceptionV1/InceptionV1/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 274 | 3211264 | 3637760 | 61803008 | GPU_0_bfc | 426496 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 864.00 | 4.90 | 0.00 | 0.00 | true | 0.048885;0.048841;0.048834;0.048852;0.048818 | 0;0;0;0;0 | 96;96;96;96;96 | 928;1248;864;800;704 |
139 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 32 7 7]] | 30.667 | 802816 | 0 | 40929792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 200704 | 803904.00 | 75061.33 | 45.80 | 0.23 | 33.45 | true | 0.459150;0.457469;0.459946;0.456580;0.458019 | 200704;200704;200704;200704;200704 | 79648;73472;78816;72896;69632 | 803904;803904;803904;803904;803904 |
140 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 160 7 7]] | 31.333 | 4014080 | 0 | 40929792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 14.00 | 1003520 | 4015637.33 | 3076896.00 | 44.70 | 0.14 | 71.68 | true | 0.451245;0.444804;0.447819;0.444149;0.447665 | 1003520;1003520;1003520;1003520;1003520 | 3095200;3070272;3083968;3066208;3076448 | 4015552;4015648;4015648;4015744;4015616 |
141 | InceptionV1/InceptionV1/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 256 7 7]] | 38 | 6422528 | 0 | 40929792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 20.00 | 1605632 | 6424106.67 | 6482592.00 | 45.80 | 0.12 | 80.28 | true | 0.457073;0.458088;0.457403;0.458429;0.456592 | 1605632;1605632;1605632;1605632;1605632 | 6423840;6424384;6424096;6423840;6424704 | 6463648;6483040;6483552;6481184;6489920 |
142 | InceptionV1/InceptionV1/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 7 7]] | 30.333 | 3211264 | 0 | 40929792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 13.00 | 802816 | 3210101.33 | 3238090.67 | 45.40 | 0.12 | 61.76 | true | 0.454909;0.454186;0.452773;0.452284;0.454252 | 802816;802816;802816;802816;802816 | 3243520;3240032;3230720;3246976;3228928 | 3208128;3211584;3210752;3210432;3209120 |
143 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 32 7 7]] | 22.333 | 802816 | 0 | 40929792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.67 | 0 | 802912.00 | 802666.67 | 60.90 | 0.00 | 0.00 | true | 0.605011;0.604554;0.610969;0.617001;0.609584 | 0;0;0;0;0 | 804000;800576;803424;790688;805152 | 802912;802912;802912;803168;802912 |
144 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 160 7 7]] | 29 | 4014080 | 0 | 40929792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 12.67 | 0 | 4014570.67 | 3704554.67 | 77.80 | 0.00 | 0.00 | true | 0.778975;0.774159;0.775294;0.780391;0.781568 | 0;0;0;0;0 | 4015424;4014144;4014080;4014144;4015936 | 3715936;3669056;3701536;3706752;3705376 |
145 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 177 | 3211264 | 3359232 | 44141056 | GPU_0_bfc | 147968 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 80.33 | 464027648 | 64480.00 | 3658592.00 | 7.90 | 124.64 | 5776.30 | false | 0.079644;0.078825;0.078673;0.078851;0.079103 | 464027648;464027648;464027648;464027648;464027648 | 61888;66784;62912;74784;63744 | 3664000;3654592;3633536;3660352;3660832 |
145 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 177 | 3211264 | 3359232 | 44141056 | GPU_0_bfc | 147968 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 148224.00 | 423338.67 | 42.70 | 0.00 | 0.00 | true | 0.426782;0.427401;0.427168;0.425253;0.428059 | 0;0;0;0;0 | 154368;147456;149504;147712;147456 | 390592;452096;435648;421312;413056 |
145 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 177 | 3211264 | 3359232 | 44141056 | GPU_0_bfc | 147968 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 1418.67 | 4.60 | 0.00 | 0.00 | true | 0.045533;0.045507;0.045507;0.045507;0.045533 | 0;0;0;0;0 | 0;1792;0;0;0 | 1408;2944;1408;1440;1280 |
146 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 7 7]] | 705.333 | 8028160 | 9871872 | 51366400 | GPU_0_bfc | 1843712 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 597.33 | 5784289280 | 761280.00 | 1115018.67 | 18.60 | 3082.82 | 9683.53 | false | 0.186755;0.185840;0.186233;0.185524;0.185835 | 5784289280;5784289280;5784289280;5784289280;5784289280 | 767648;186592;1138752;1249856;377440 | 1121280;280576;1663200;1691840;560576 |
146 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 7 7]] | 705.333 | 8028160 | 9871872 | 51366400 | GPU_0_bfc | 1843712 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 1843818.67 | 3166325.33 | 46.00 | 0.00 | 0.00 | true | 0.459251;0.461393;0.462630;0.459303;0.458285 | 0;0;0;0;0 | 1843776;1844672;1843712;1843968;1843456 | 3179168;3165888;3212736;3105312;3153920 |
146 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 320 7 7]] | 705.333 | 8028160 | 9871872 | 51366400 | GPU_0_bfc | 1843712 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 640.00 | 4.80 | 0.00 | 0.00 | true | 0.047873;0.047850;0.047903;0.047914;0.047936 | 0;0;0;0;0 | 736;608;576;576;3232 | 96;96;96;96;96 |
147 | InceptionV1/InceptionV1/Mixed_5b/Branch_2/Conv2d_0a_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 128 7 7]] | 36.667 | 3211264 | 0 | 47352320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 12.00 | 802816 | 3216106.67 | 3017301.33 | 45.30 | 0.13 | 66.90 | true | 0.452759;0.453435;0.455838;0.453060;0.453295 | 802816;802816;802816;802816;802816 | 3215968;3216064;3218144;3216000;3216256 | 3030368;3007616;3013920;3031008;2994784 |
148 | InceptionV1/InceptionV1/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 320 7 7]] | 41.333 | 8028160 | 0 | 47352320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 24.00 | 2007040 | 6771765.33 | 8164426.67 | 45.10 | 0.13 | 83.63 | true | 0.450097;0.450949;0.453741;0.452370;0.449174 | 2007040;2007040;2007040;2007040;2007040 | 6759488;6801664;6754144;6746272;6808544 | 8171936;8174240;8153536;8167808;8151968 |
149 | InceptionV1/InceptionV1/Mixed_5b/concat | ConcatV2 | [[128 832 7 7]] | 116 | 32112640 | 32112640 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 17.00 | 0 | 4951029.33 | 4992949.33 | 91.30 | 0.00 | 0.00 | true | 0.930608;0.944858;0.900314;0.899504;0.923782;0.942297;0.895031;0.897926;0.933251;0.942706;0.886729;0.884690;0.925608;0.935425;0.895203;0.893202;0.929875;0.941960;0.891510;0.893236 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 6422624;8028320;3211360;3213984;6422688;8028256;3211360;3211424;6422624;8028320;3211360;3211424;6422624;8028320;3211360;3211360;6422624;8028320;3211360;3211360 | 6501728;7993056;3206592;3253984;6497760;8061152;3154624;3237952;6500576;8018144;3190848;3226688;6522464;8003680;3187936;3218496;6509024;8032224;3168064;3247072 |
149 | InceptionV1/InceptionV1/Mixed_5b/concat | ConcatV2 | [[128 832 7 7]] | 116 | 32112640 | 32112640 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 16.64 | 0 | 4951029.33 | 4992949.33 | 91.30 | 0.00 | 0.00 | true | 0.930608;0.944858;0.900314;0.899504;0.923782;0.942297;0.895031;0.897926;0.933251;0.942706;0.886729;0.884690;0.925608;0.935425;0.895203;0.893202;0.929875;0.941960;0.891510;0.893236 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 6422624;8028320;3211360;3213984;6422688;8028256;3211360;3211424;6422624;8028320;3211360;3211424;6422624;8028320;3211360;3211360;6422624;8028320;3211360;3211360 | 6501728;7993056;3206592;3253984;6497760;8061152;3154624;3237952;6500576;8018144;3190848;3226688;6522464;8003680;3187936;3218496;6509024;8032224;3168064;3247072 |
149 | InceptionV1/InceptionV1/Mixed_5b/concat | ConcatV2 | [[128 832 7 7]] | 116 | 32112640 | 32112640 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 15.91 | 0 | 4951029.33 | 4992949.33 | 91.30 | 0.00 | 0.00 | true | 0.930608;0.944858;0.900314;0.899504;0.923782;0.942297;0.895031;0.897926;0.933251;0.942706;0.886729;0.884690;0.925608;0.935425;0.895203;0.893202;0.929875;0.941960;0.891510;0.893236 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 6422624;8028320;3211360;3213984;6422688;8028256;3211360;3211424;6422624;8028320;3211360;3211424;6422624;8028320;3211360;3211360;6422624;8028320;3211360;3211360 | 6501728;7993056;3206592;3253984;6497760;8061152;3154624;3237952;6500576;8018144;3190848;3226688;6522464;8003680;3187936;3218496;6509024;8032224;3168064;3247072 |
149 | InceptionV1/InceptionV1/Mixed_5b/concat | ConcatV2 | [[128 832 7 7]] | 116 | 32112640 | 32112640 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 15.91 | 0 | 4951029.33 | 4992949.33 | 91.30 | 0.00 | 0.00 | true | 0.930608;0.944858;0.900314;0.899504;0.923782;0.942297;0.895031;0.897926;0.933251;0.942706;0.886729;0.884690;0.925608;0.935425;0.895203;0.893202;0.929875;0.941960;0.891510;0.893236 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 6422624;8028320;3211360;3213984;6422688;8028256;3211360;3211424;6422624;8028320;3211360;3211424;6422624;8028320;3211360;3211360;6422624;8028320;3211360;3211360 | 6501728;7993056;3206592;3253984;6497760;8061152;3154624;3237952;6500576;8018144;3190848;3226688;6522464;8003680;3187936;3218496;6509024;8032224;3168064;3247072 |
150 | InceptionV1/InceptionV1/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 832 7 7]] | 73 | 32112640 | 0 | 58591744 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 54.00 | 0 | 20464106.67 | 20592437.33 | 92.00 | 0.00 | 0.00 | true | 0.912762;0.919606;0.922937;0.932008;0.918395 | 0;0;0;0;0 | 20549568;20605664;20605312;20574432;20597568 | 20463424;20465248;20467424;20463648;20460192 |
151 | InceptionV1/InceptionV1/Mixed_5c/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[128 832 7 7]] | 150 | 20873216 | 20873216 | 79464960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 117.00 | 5218304 | 20873408.00 | 24236074.67 | 65.40 | 0.12 | 44.60 | true | 0.654368;0.654491;0.654381;0.654482;0.654443 | 5218304;5218304;5218304;5218304;5218304 | 20873408;20873408;20873408;20873408;20873408 | 24246208;24228096;24232352;24248448;24229664 |
152 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 48 7 7]] | 202 | 1204224 | 1364480 | 80669184 | GPU_0_bfc | 160256 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 108.33 | 668745728 | 20717376.00 | 2898336.00 | 6.20 | 28.32 | 6173.06 | false | 0.062496;0.062496;0.062496;0.062496;0.062496 | 668745728;668745728;668745728;668745728;668745728 | 20721152;20714592;20721312;20716384;20710080 | 2897728;2895840;2901440;2903296;2883136 |
152 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 48 7 7]] | 202 | 1204224 | 1364480 | 80669184 | GPU_0_bfc | 160256 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 159936.00 | 182122.67 | 44.70 | 0.00 | 0.00 | true | 0.445871;0.448348;0.447290;0.446144;0.446355 | 0;0;0;0;0 | 159936;165056;159936;159936;159936 | 178400;187296;180320;187424;178752 |
152 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 48 7 7]] | 202 | 1204224 | 1364480 | 80669184 | GPU_0_bfc | 160256 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 778.67 | 4.90 | 0.00 | 0.00 | true | 0.048872;0.048849;0.048885;0.048831;0.048855 | 0;0;0;0;0 | 96;96;96;96;96 | 576;480;768;992;1024 |
153 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 192 7 7]] | 271.667 | 4816896 | 5456384 | 85486080 | GPU_0_bfc | 639488 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 177.33 | 2006237184 | 24549120.00 | 4688384.00 | 11.70 | 68.62 | 11313.39 | false | 0.117446;0.116539;0.116851;0.117135;0.116881 | 2006237184;2006237184;2006237184;2006237184;2006237184 | 4742848;4672768;4604480;4869344;4649536 | 24642432;24500704;24397408;24758336;24504224 |
153 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 192 7 7]] | 271.667 | 4816896 | 5456384 | 85486080 | GPU_0_bfc | 639488 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 639168.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.442224;0.442014;0.442904;0.442819;0.441430 | 0;0;0;0;0 | 0;0;128;0;0 | 639168;639168;639168;639168;644544 |
153 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 192 7 7]] | 271.667 | 4816896 | 5456384 | 85486080 | GPU_0_bfc | 639488 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 0.00 | 4.90 | 0.00 | 0.00 | true | 0.048787;0.048878;0.048787;0.048852;0.048797 | 0;0;0;0;0 | 96;7520;96;96;96 | 0;0;0;0;0 |
154 | InceptionV1/InceptionV1/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 384 7 7]] | 440.667 | 9633792 | 10912256 | 95119872 | GPU_0_bfc | 1278464 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 343.00 | 4012474368 | 68570037.33 | 11450517.33 | 22.00 | 50.14 | 11698.18 | false | 0.220293;0.220670;0.220322;0.220516;0.220173 | 4012474368;4012474368;4012474368;4012474368;4012474368 | 69382816;66047520;66209760;72765376;70117536 | 11407232;11491872;11509824;11452448;11320160 |
154 | InceptionV1/InceptionV1/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 384 7 7]] | 440.667 | 9633792 | 10912256 | 95119872 | GPU_0_bfc | 1278464 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1278442.67 | 1770794.67 | 45.80 | 0.00 | 0.00 | true | 0.458856;0.456505;0.462529;0.459309;0.457031 | 0;0;0;0;0 | 1278688;1278304;1278336;1278848;1278208 | 1721408;1760800;1920864;1619584;1830176 |
154 | InceptionV1/InceptionV1/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 384 7 7]] | 440.667 | 9633792 | 10912256 | 95119872 | GPU_0_bfc | 1278464 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 565.33 | 4.90 | 0.00 | 0.00 | true | 0.048841;0.049695;0.050524;0.048872;0.048808 | 0;0;0;0;0 | 96;96;96;96;96 | 480;640;384;576;736 |
155 | InceptionV1/InceptionV1/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 274.667 | 3211264 | 3637760 | 66218496 | GPU_0_bfc | 426496 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 178.00 | 1337491456 | 31150176.00 | 5178645.33 | 8.20 | 36.82 | 7514.00 | false | 0.081998;0.081865;0.082020;0.081613;0.082035 | 1337491456;1337491456;1337491456;1337491456;1337491456 | 31113344;31200800;31207616;31090400;31136384 | 5180928;5159168;5135296;5232192;5195840 |
155 | InceptionV1/InceptionV1/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 274.667 | 3211264 | 3637760 | 66218496 | GPU_0_bfc | 426496 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 427818.67 | 340629.33 | 44.90 | 0.00 | 0.00 | true | 0.448560;0.450666;0.448484;0.447446;0.451200 | 0;0;0;0;0 | 427328;430848;429216;426912;426912 | 353312;320512;363584;282432;348064 |
155 | InceptionV1/InceptionV1/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 274.667 | 3211264 | 3637760 | 66218496 | GPU_0_bfc | 426496 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 746.67 | 4.90 | 0.00 | 0.00 | true | 0.048852;0.048841;0.048797;0.048899;0.048852 | 0;0;0;0;0 | 96;96;96;96;96 | 608;736;736;6112;768 |
156 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 48 7 7]] | 29.333 | 1204224 | 0 | 45345280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.67 | 301056 | 1204608.00 | 183936.00 | 45.50 | 0.22 | 45.16 | true | 0.454877;0.454494;0.453470;0.455250;0.455836 | 301056;301056;301056;301056;301056 | 1204608;1204608;1204608;1211520;1204608 | 183296;186560;175904;182912;185600 |
157 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 192 7 7]] | 34 | 4816896 | 0 | 45345280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 16.33 | 1204224 | 4819669.33 | 4345504.00 | 45.00 | 0.13 | 73.73 | true | 0.448259;0.449994;0.446620;0.451117;0.452740 | 1204224;1204224;1204224;1204224;1204224 | 4819552;4819360;4819488;4819968;4824352 | 4341760;4350848;4350976;4343328;4342336 |
158 | InceptionV1/InceptionV1/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 384 7 7]] | 46.667 | 9633792 | 0 | 45345280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 28.00 | 2408448 | 9635456.00 | 9622229.33 | 46.20 | 0.13 | 86.02 | true | 0.460949;0.461614;0.460810;0.463265;0.464689 | 2408448;2408448;2408448;2408448;2408448 | 9635456;9635456;9635456;9635456;9635712 | 9633024;9618336;9609152;9629248;9619104 |
159 | InceptionV1/InceptionV1/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[128 128 7 7]] | 29 | 3211264 | 0 | 45345280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 13.00 | 802816 | 3211968.00 | 3231552.00 | 45.40 | 0.12 | 61.76 | true | 0.457405;0.453524;0.453070;0.454796;0.455070 | 802816;802816;802816;802816;802816 | 3211968;3211968;3211968;3212032;3211968 | 3233024;3228416;3245984;3233216;3222528 |
160 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[128 48 7 7]] | 22.667 | 1204224 | 0 | 45345280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 6.00 | 0 | 1204320.00 | 1213280.00 | 52.70 | 0.00 | 0.00 | true | 0.525178;0.534684;0.527213;0.525783;0.529166 | 0;0;0;0;0 | 1211424;1208000;1220416;1207808;1221440 | 1204320;1204320;1204320;1204320;1204320 |
161 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[128 192 7 7]] | 30.333 | 4816896 | 0 | 45345280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 14.33 | 0 | 4816938.67 | 4555733.33 | 80.10 | 0.00 | 0.00 | true | 0.808061;0.800167;0.801913;0.799238;0.800295 | 0;0;0;0;0 | 4816960;4816896;4819008;4816896;4816960 | 4575744;4538656;4543232;4579584;4548224 |
162 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 196.667 | 3211264 | 3432960 | 48556544 | GPU_0_bfc | 221696 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 101.00 | 695238656 | 432128.00 | 3641514.67 | 8.10 | 170.67 | 6883.55 | false | 0.081718;0.081211;0.081508;0.081051;0.081653 | 695238656;695238656;695238656;695238656;695238656 | 432320;432032;429312;443840;432032 | 3628352;3647072;3645024;3632448;3650688 |
162 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 196.667 | 3211264 | 3432960 | 48556544 | GPU_0_bfc | 221696 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 445984.00 | 44.30 | 0.00 | 0.00 | true | 0.444301;0.443610;0.443118;0.441825;0.443347 | 0;0;0;0;0 | 221184;221184;221184;226304;221184 | 438880;455424;443616;451744;442592 |
162 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 128 7 7]] | 196.667 | 3211264 | 3432960 | 48556544 | GPU_0_bfc | 221696 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 1493.33 | 4.60 | 0.00 | 0.00 | true | 0.045493;0.045507;0.045507;0.045572;0.045560 | 0;0;0;0;0 | 0;0;0;0;2048 | 1408;1280;1792;1280;5376 |
163 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 384 7 7]] | 1004 | 9633792 | 19662336 | 56986112 | GPU_0_bfc | 10028544 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 881.67 | 10107224064 | 668885.33 | 2100661.33 | 24.50 | 3649.41 | 11463.77 | false | 0.245241;0.245182;0.245737;0.244038;0.245110 | 10107224064;10107224064;10107224064;10107224064;10107224064 | 924032;577600;694400;558816;734656 | 2219328;1794624;2241728;2050976;2031680 |
163 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 384 7 7]] | 1004 | 9633792 | 19662336 | 56986112 | GPU_0_bfc | 10028544 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 19.33 | 0 | 2659072.00 | 4278528.00 | 46.80 | 0.00 | 0.00 | true | 0.467556;0.466302;0.467381;0.468556;0.468834 | 0;0;0;0;0 | 4298912;4276544;4286112;4244288;4272928 | 2657344;2657024;2665408;2659008;2660864 |
163 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[128 384 7 7]] | 1004 | 9633792 | 19662336 | 56986112 | GPU_0_bfc | 10028544 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 9.67 | 4276224 | 699264.00 | 2984192.00 | 36.80 | 1.16 | 442.35 | true | 0.368769;0.369274;0.365100;0.370023;0.364282 | 4276224;4276224;4276224;4276224;4276224 | 2955168;3011424;3029536;2965952;2975200 | 722496;692736;703552;676224;701504 |
164 | InceptionV1/InceptionV1/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 128 7 7]] | 36.667 | 3211264 | 0 | 52169216 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 11.00 | 802816 | 3212480.00 | 1267669.33 | 45.50 | 0.18 | 72.98 | true | 0.452066;0.453862;0.454279;0.455795;0.455492 | 802816;802816;802816;802816;802816 | 3212480;3212736;3212480;3212480;3212480 | 1169984;1641568;1242592;1284800;1275616 |
165 | InceptionV1/InceptionV1/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[128 384 7 7]] | 46.667 | 9633792 | 0 | 52169216 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 28.67 | 2408448 | 9504981.33 | 9674666.67 | 46.20 | 0.13 | 84.01 | true | 0.462048;0.462599;0.461607;0.459685;0.461250 | 2408448;2408448;2408448;2408448;2408448 | 9491584;9471104;9512064;9518464;9511296 | 9637152;9691072;9658144;9674784;9692608 |
166 | InceptionV1/InceptionV1/Mixed_5c/concat | ConcatV2 | [[128 1024 7 7]] | 134.667 | 45359104 | 45359104 | 97528320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 20.73 | 0 | 6422810.67 | 6417760.00 | 92.60 | 0.00 | 0.00 | true | 0.945928;0.941874;0.898993;0.912609;0.942100;0.947023;0.904636;0.907417;0.943300;0.945446;0.900554;0.912701;0.946796;0.944119;0.906773;0.895656;0.945712;0.944192;0.896894;0.907106 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 9719392;9604192;3191616;3201440;9715424;9597408;3185856;3190080;9712992;9617376;3254144;3113792;9724384;9608544;3184512;3189376;9701984;9639264;3157408;3217696 | 9633952;9633952;3211360;3211424;9633888;9633952;3211360;3211424;9633952;9633952;3211360;3211360;9633888;9633952;3211360;3211424;9633952;9640864;3211360;3213152 |
166 | InceptionV1/InceptionV1/Mixed_5c/concat | ConcatV2 | [[128 1024 7 7]] | 134.667 | 45359104 | 45359104 | 97528320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 20.73 | 0 | 6422810.67 | 6417760.00 | 92.60 | 0.00 | 0.00 | true | 0.945928;0.941874;0.898993;0.912609;0.942100;0.947023;0.904636;0.907417;0.943300;0.945446;0.900554;0.912701;0.946796;0.944119;0.906773;0.895656;0.945712;0.944192;0.896894;0.907106 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 9719392;9604192;3191616;3201440;9715424;9597408;3185856;3190080;9712992;9617376;3254144;3113792;9724384;9608544;3184512;3189376;9701984;9639264;3157408;3217696 | 9633952;9633952;3211360;3211424;9633888;9633952;3211360;3211424;9633952;9633952;3211360;3211360;9633888;9633952;3211360;3211424;9633952;9640864;3211360;3213152 |
166 | InceptionV1/InceptionV1/Mixed_5c/concat | ConcatV2 | [[128 1024 7 7]] | 134.667 | 45359104 | 45359104 | 97528320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 19.27 | 0 | 6422810.67 | 6417760.00 | 92.60 | 0.00 | 0.00 | true | 0.945928;0.941874;0.898993;0.912609;0.942100;0.947023;0.904636;0.907417;0.943300;0.945446;0.900554;0.912701;0.946796;0.944119;0.906773;0.895656;0.945712;0.944192;0.896894;0.907106 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 9719392;9604192;3191616;3201440;9715424;9597408;3185856;3190080;9712992;9617376;3254144;3113792;9724384;9608544;3184512;3189376;9701984;9639264;3157408;3217696 | 9633952;9633952;3211360;3211424;9633888;9633952;3211360;3211424;9633952;9633952;3211360;3211360;9633888;9633952;3211360;3211424;9633952;9640864;3211360;3213152 |
166 | InceptionV1/InceptionV1/Mixed_5c/concat | ConcatV2 | [[128 1024 7 7]] | 134.667 | 45359104 | 45359104 | 97528320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 19.27 | 0 | 6422810.67 | 6417760.00 | 92.60 | 0.00 | 0.00 | true | 0.945928;0.941874;0.898993;0.912609;0.942100;0.947023;0.904636;0.907417;0.943300;0.945446;0.900554;0.912701;0.946796;0.944119;0.906773;0.895656;0.945712;0.944192;0.896894;0.907106 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 | 9719392;9604192;3191616;3201440;9715424;9597408;3185856;3190080;9712992;9617376;3254144;3113792;9724384;9608544;3184512;3189376;9701984;9639264;3157408;3217696 | 9633952;9633952;3211360;3211424;9633888;9633952;3211360;3211424;9633952;9633952;3211360;3211360;9633888;9633952;3211360;3211424;9633952;9640864;3211360;3213152 |
167 | InceptionV1/InceptionV1/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[128 1024 7 7]] | 86 | 45359104 | 0 | 71838208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 66.00 | 0 | 25401482.67 | 25526346.67 | 93.60 | 0.00 | 0.00 | true | 0.936344;0.938684;0.936949;0.935861;0.932392 | 0;0;0;0;0 | 25398624;25394912;25397408;25412320;25408416 | 25499456;25531936;25553216;25545440;25501664 |
168 | InceptionV1/Logits/AvgPool_0a_7x7/AvgPool | AvgPool | [[128 1024 1 1]] | 119.333 | 524288 | 524288 | 72362496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 81.33 | 9240832 | 54464576.00 | 3549568.00 | 63.00 | 0.16 | 113.62 | true | 0.629924;0.630209;0.630987;0.628818;0.629607 | 9240832;9240832;9240832;9240832;9240832 | 3577280;3543008;3536672;3547072;3558624 | 54458816;53971840;54648768;54528064;54406848 |
169 | InceptionV1/Logits/Conv2d_0c_1x1/convolution | Conv2D | [[128 1001 1 1]] | 325 | 512512 | 4612864 | 27515904 | GPU_0_bfc | 4100352 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 200.67 | 268697600 | 2047658.67 | 806517.33 | 6.20 | 94.14 | 1339.02 | false | 0.062472;0.062473;0.062473;0.062473;0.062473 | 268697600;268697600;268697600;268697600;268697600 | 2053120;2052992;2030208;2040448;2049536 | 809664;803616;805440;805504;808608 |
169 | InceptionV1/Logits/Conv2d_0c_1x1/convolution | Conv2D | [[128 1001 1 1]] | 325 | 512512 | 4612864 | 27515904 | GPU_0_bfc | 4100352 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 28.00 | 0 | 4150485.33 | 1785365.33 | 42.80 | 0.00 | 0.00 | true | 0.430394;0.423479;0.429369;0.426752;0.429152 | 0;0;0;0;0 | 4151168;4149888;4149504;4150400;4152128 | 1786880;1790048;1784544;1779680;1784672 |
169 | InceptionV1/Logits/Conv2d_0c_1x1/convolution | Conv2D | [[128 1001 1 1]] | 325 | 512512 | 4612864 | 27515904 | GPU_0_bfc | 4100352 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 725.33 | 4.60 | 0.00 | 0.00 | true | 0.046371;0.046371;0.046464;0.046384;0.046358 | 0;0;0;0;0 | 96;96;96;96;96 | 640;896;640;640;896 |
170 | InceptionV1/Logits/Conv2d_0c_1x1/BiasAdd | BiasAdd | [[128 1001 1 1]] | 28.667 | 512512 | 0 | 26991616 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 6.00 | 128128 | 6080.00 | 74.67 | 44.30 | 20.82 | 21.35 | false | 0.443241;0.442902;0.447556;0.442398;0.443243 | 128128;128128;128128;128128;128128 | 6080;6080;6080;6080;6080 | 32;160;160;32;32 |
174 | InceptionV1/Logits/Predictions/Softmax | Softmax | [[128 1001]] | 68.667 | 512512 | 1036800 | 26991616 | GPU_0_bfc | 1036800 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 9.67 | 1291776 | 6656.00 | 85.33 | 6.20 | 191.62 | 133.63 | false | 0.062431;0.062437;0.062427;0.062408;0.062439 | 1291776;1291776;1291776;1291776;1291776 | 128;0;128;0;256 | 6656;6656;6656;6656;6656 |
174 | InceptionV1/Logits/Predictions/Softmax | Softmax | [[128 1001]] | 68.667 | 512512 | 1036800 | 26991616 | GPU_0_bfc | 1036800 | 0 | 0 | 0 | void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 6.00 | 3075072 | 2560.00 | 170.67 | 37.80 | 1126.12 | 512.51 | false | 0.379480;0.377736;0.377859;0.375962;0.399292 | 3075072;3075072;3075072;3075072;3075072 | 128;128;256;256;0 | 2560;2560;4864;2560;2560 |
174 | InceptionV1/Logits/Predictions/Softmax | Softmax | [[128 1001]] | 68.667 | 512512 | 1036800 | 26991616 | GPU_0_bfc | 1036800 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 5.00 | 0 | 3840.00 | 213.33 | 6.20 | 0.00 | 0.00 | true | 0.062397;0.062402;0.062399;0.062405;0.062366 | 0;0;0;0;0 | 3840;3840;3840;4096;3840 | 256;256;384;128;128 |
Showing 1 to 347 of 347 entries