GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise-0-TransposeNHWCToNCHW-LayoutOptimizer | Transpose | [[1 3 224 224]] | 80.667 | 602112 | 602112 | 45926912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 6.33 | 0 | 2901.33 | 454282.67 | 59.20 | 0.00 | 0.00 | true | 0.592059;0.591741;0.593083;0.590838;0.592419 | 0;0;0;0;0 | 403936;502240;471328;448704;442816 | 6912;3328;2560;2816;2560 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 277 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 13.56 | 45058048 | 398.22 | 391904.00 | 12.00 | 114.86 | 3323.85 | false | 0.120015;0.120184;0.119776;0.119568;0.121047;0.119279;0.122730;0.119529;0.118975;0.119777;0.121087;0.120253;0.120049;0.120778;0.120594 | 45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048 | 256;256;256;1536;256;256;1536;256;256;1536;256;256;1536;256;256 | 433152;384672;365504;436096;380704;365248;434624;381984;363712;435456;385440;371008;439104;382656;372896 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 277 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 13.44 | 45058048 | 398.22 | 391904.00 | 12.00 | 114.86 | 3351.54 | false | 0.120015;0.120184;0.119776;0.119568;0.121047;0.119279;0.122730;0.119529;0.118975;0.119777;0.121087;0.120253;0.120049;0.120778;0.120594 | 45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048 | 433152;384672;365504;436096;380704;365248;434624;381984;363712;435456;385440;371008;439104;382656;372896 | 256;256;256;1536;256;256;1536;256;256;1536;256;256;1536;256;256 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 277 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 13.44 | 45058048 | 398.22 | 391904.00 | 12.00 | 114.86 | 3351.54 | false | 0.120015;0.120184;0.119776;0.119568;0.121047;0.119279;0.122730;0.119529;0.118975;0.119777;0.121087;0.120253;0.120049;0.120778;0.120594 | 45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048 | 256;256;256;1536;256;256;1536;256;256;1536;256;256;1536;256;256 | 433152;384672;365504;436096;380704;365248;434624;381984;363712;435456;385440;371008;439104;382656;372896 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 277 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 6.00 | 0 | 1536.00 | 359594.67 | 47.40 | 0.00 | 0.00 | true | 0.473314;0.474128;0.474862;0.474482;0.476879 | 0;0;0;0;0 | 1280;2304;1536;1536;1536 | 425696;336896;364832;359456;354496 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 277 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 4821.33 | 2048.00 | 36.20 | 0.00 | 0.00 | true | 0.361787;0.362911;0.362382;0.362346;0.362586 | 0;0;0;0;0 | 4736;4992;4736;4992;4736 | 6912;256;128;5760;0 |
3 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d | Conv2D | [[1 64 112 112]] | 148.333 | 3211264 | 3292928 | 49138176 | GPU_0_bfc | 81664 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 15.00 | 40140800 | 5461.33 | 2267594.67 | 7.80 | 17.66 | 2676.05 | false | 0.077697;0.077569;0.077816;0.077655;0.077656 | 40140800;40140800;40140800;40140800;40140800 | 4608;6912;5376;5376;5632 | 2272064;2253856;2272480;2272192;2258528 |
3 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d | Conv2D | [[1 64 112 112]] | 148.333 | 3211264 | 3292928 | 49138176 | GPU_0_bfc | 81664 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 6144.00 | 42.67 | 36.50 | 0.00 | 0.00 | true | 0.364527;0.365467;0.365437;0.365494;0.365758 | 0;0;0;0;0 | 6144;6144;6144;6144;6144 | 0;128;128;0;0 |
3 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d | Conv2D | [[1 64 112 112]] | 148.333 | 3211264 | 3292928 | 49138176 | GPU_0_bfc | 81664 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 6506.67 | 7.30 | 0.00 | 0.00 | true | 0.073328;0.073266;0.073330;0.073315;0.073257 | 0;0;0;0;0 | 256;0;0;0;0 | 7616;5184;6464;8640;5440 |
4 | InceptionV2/InceptionV2/Conv2d_1a_7x7/BiasAdd | BiasAdd | [[1 64 112 112]] | 36 | 3211264 | 0 | 47933952 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 9.00 | 802816 | 384.00 | 1476949.33 | 47.50 | 0.54 | 89.20 | true | 0.478023;0.474942;0.475704;0.473971;0.475321 | 802816;802816;802816;802816;802816 | 384;384;384;384;384 | 1473024;1472832;1462944;1494176;1484992 |
5 | InceptionV2/InceptionV2/Conv2d_1a_7x7/Relu | Relu | [[1 64 112 112]] | 26.667 | 3211264 | 0 | 47933952 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 6.00 | 0 | 1450.67 | 1072032.00 | 75.30 | 0.00 | 0.00 | true | 0.753498;0.751947;0.753447;0.752743;0.734464 | 0;0;0;0;0 | 1280;1536;1536;1280;1536 | 1084576;1048448;1055616;1111072;1075904 |
6 | InceptionV2/InceptionV2/MaxPool_2a_3x3/MaxPool | MaxPool | [[1 64 56 56]] | 51.333 | 802816 | 802816 | 48736768 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 200704 | 2880.00 | 965621.33 | 54.60 | 0.21 | 22.30 | true | 0.546174;0.546083;0.544716;0.546632;0.548411 | 200704;200704;200704;200704;200704 | 2368;2880;2880;2880;8000 | 950176;982272;967712;959040;970112 |
7 | InceptionV2/InceptionV2/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 56 56]] | 104.333 | 802816 | 819200 | 46328320 | GPU_0_bfc | 16384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 13.00 | 25890816 | 448.00 | 820341.33 | 6.90 | 31.54 | 1991.60 | false | 0.069296;0.069081;0.068959;0.068879;0.069096 | 25890816;25890816;25890816;25890816;25890816 | 448;448;448;448;448 | 820192;820704;820096;820640;820192 |
7 | InceptionV2/InceptionV2/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 56 56]] | 104.333 | 802816 | 819200 | 46328320 | GPU_0_bfc | 16384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 16469.33 | 384.00 | 41.80 | 0.00 | 0.00 | true | 0.419233;0.416814;0.420138;0.415273;0.417860 | 0;0;0;0;0 | 16384;16384;16640;16384;16640 | 384;384;384;384;512 |
8 | InceptionV2/InceptionV2/Conv2d_2b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 56 56]] | 34.667 | 802816 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 200704 | 512.00 | 810.67 | 66.30 | 151.74 | 40.14 | false | 0.661146;0.662329;0.663268;0.664087;0.663698 | 200704;200704;200704;200704;200704 | 512;512;512;512;544 | 512;1024;768;640;2656 |
9 | InceptionV2/InceptionV2/Conv2d_2b_1x1/Relu | Relu | [[1 64 56 56]] | 22.667 | 802816 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.33 | 0 | 0.00 | 256.00 | 67.40 | 0.00 | 0.00 | true | 0.669987;0.674840;0.673451;0.674350;0.674116 | 0;0;0;0;0 | 384;256;256;256;256 | 0;0;0;0;2304 |
10 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 56 56]] | 172 | 2408448 | 4440832 | 47933952 | GPU_0_bfc | 2032384 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 55.00 | 373309440 | 6741.33 | 2690261.33 | 21.00 | 138.42 | 6787.44 | false | 0.212656;0.208730;0.214599;0.207790;0.209403 | 373309440;373309440;373309440;373309440;373309440 | 6656;7424;6912;6656;6656 | 2719136;2663232;2688416;2627872;2781760 |
10 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 56 56]] | 172 | 2408448 | 4440832 | 47933952 | GPU_0_bfc | 2032384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 442400.00 | 5952.00 | 46.10 | 0.00 | 0.00 | true | 0.460765;0.460568;0.460271;0.459136;0.460988 | 0;0;0;0;0 | 442656;442400;442400;442400;442368 | 6848;5440;7712;5568;4992 |
10 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 56 56]] | 172 | 2408448 | 4440832 | 47933952 | GPU_0_bfc | 2032384 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 1280.00 | 129642.67 | 7.50 | 5.44 | 142.54 | true | 0.075108;0.075024;0.075183;0.075172;0.075179 | 712704;712704;712704;712704;712704 | 1792;1792;1024;1024;1024 | 141184;130720;130464;127136;127744 |
11 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 56 56]] | 28 | 2408448 | 0 | 47131136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.33 | 602112 | 938.67 | 102282.67 | 73.20 | 5.83 | 95.08 | true | 0.729417;0.733522;0.731385;0.732766;0.733342 | 602112;602112;602112;602112;602112 | 1024;1024;768;6912;768 | 86016;77248;118176;112608;108224 |
12 | InceptionV2/InceptionV2/Conv2d_2c_3x3/Relu | Relu | [[1 192 56 56]] | 21.667 | 2408448 | 0 | 47131136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 1792.00 | 22101.33 | 64.80 | 0.00 | 0.00 | true | 0.648673;0.647822;0.649800;0.648549;0.645873 | 0;0;0;0;0 | 58752;14720;23040;5504;28544 | 0;0;6656;0;5376 |
13 | InceptionV2/InceptionV2/MaxPool_3a_3x3/MaxPool | MaxPool | [[1 192 28 28]] | 41 | 602112 | 602112 | 47733248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 150528 | 0.00 | 601898.67 | 60.30 | 0.25 | 21.50 | true | 0.551446;0.599051;0.604535;0.606534;0.608342 | 150528;150528;150528;150528;150528 | 256;0;0;0;0 | 601856;601920;601952;601920;601824 |
14 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 192 28 28]] | 39 | 602112 | 602112 | 45926912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 3956397 | 3498.67 | 602944.00 | 55.40 | 6.52 | 439.60 | true | 0.540969;0.549486;0.560300;0.561571;0.552144 | 3956397;3956397;3956397;3956397;3956397 | 2048;3584;3584;3584;3328 | 602944;602912;602944;602944;602944 |
15 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 100 | 200704 | 249856 | 46127616 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 16.00 | 19710976 | 853.33 | 249440.00 | 3.10 | 78.75 | 1231.94 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 19710976;19710976;19710976;19710976;19710976 | 3072;768;768;1024;768 | 249504;249408;249280;249408;249536 |
15 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 100 | 200704 | 249856 | 46127616 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 49152.00 | 384.00 | 41.30 | 0.00 | 0.00 | true | 0.413787;0.412229;0.412123;0.411667;0.416689 | 0;0;0;0;0 | 49152;49152;49152;49152;49152 | 512;384;384;384;384 |
16 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 97.333 | 200704 | 249856 | 46328320 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 16.00 | 19710976 | 0.00 | 196064.00 | 3.10 | 100.53 | 1231.94 | false | 0.031244;0.031244;0.031244;0.031244;0.031244 | 19710976;19710976;19710976;19710976;19710976 | 196832;198368;184800;196576;194784 | 0;0;0;0;0 |
16 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 97.333 | 200704 | 249856 | 46328320 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 49152.00 | 42.67 | 41.40 | 0.00 | 0.00 | true | 0.413514;0.413208;0.413259;0.414863;0.417370 | 0;0;0;0;0 | 128;0;0;128;0 | 49152;49152;49152;49152;49152 |
17 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 97 | 200704 | 249856 | 46529024 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 16.00 | 19710976 | 0.00 | 186058.67 | 3.10 | 105.94 | 1231.94 | false | 0.031244;0.031244;0.031244;0.031244;0.031244 | 19710976;19710976;19710976;19710976;19710976 | 0;0;0;0;0 | 191520;191520;149280;192544;175136 |
17 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 97 | 200704 | 249856 | 46529024 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 49152.00 | 128.00 | 41.20 | 0.00 | 0.00 | true | 0.412220;0.412255;0.412412;0.412244;0.412373 | 0;0;0;0;0 | 49152;49152;49408;49152;49152 | 128;0;1024;128;128 |
18 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 32 28 28]] | 97.667 | 100352 | 124928 | 46027264 | GPU_0_bfc | 24576 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 16.33 | 9855488 | 0.00 | 586.67 | 3.10 | 16799.12 | 603.41 | false | 0.031244;0.031245;0.031244;0.031245;0.031244 | 9855488;9855488;9855488;9855488;9855488 | 544;800;672;544;544 | 256;0;0;0;0 |
18 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 32 28 28]] | 97.667 | 100352 | 124928 | 46027264 | GPU_0_bfc | 24576 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 24576.00 | 0.00 | 41.50 | 0.00 | 0.00 | true | 0.416305;0.412113;0.415423;0.416312;0.412154 | 0;0;0;0;0 | 24576;24576;24576;24576;24576 | 0;0;0;128;0 |
19 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 27.333 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 50176 | 256.00 | 128.00 | 44.90 | 130.67 | 11.58 | false | 0.448620;0.449680;0.449159;0.448477;0.448958 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 256;128;128;128;128 |
20 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 21 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.40 | 196.00 | 12.54 | false | 0.442717;0.444437;0.444586;0.443829;0.444530 | 50176;50176;50176;50176;50176 | 256;6144;256;256;256 | 0;0;0;0;128 |
21 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20.333 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.40 | 196.00 | 12.54 | false | 0.442377;0.442994;0.444890;0.444041;0.444427 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;0;0;128;0 |
22 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 32 28 28]] | 20 | 100352 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 25088 | 128.00 | 42.67 | 43.50 | 147.00 | 5.79 | false | 0.433421;0.436390;0.435944;0.435701;0.434552 | 25088;25088;25088;25088;25088 | 128;384;128;128;128 | 0;384;128;0;0 |
23 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 20.333 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438476;0.439173;0.439024;0.438692;0.438352 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
24 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 19 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.434550;0.434949;0.434749;0.435143;0.434715 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
25 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 117 | 301056 | 1137152 | 45726208 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 53329920 | 0.00 | 333258.67 | 12.50 | 160.03 | 2539.52 | false | 0.124662;0.124677;0.124674;0.124678;0.124667 | 53329920;53329920;53329920;53329920;53329920 | 0;0;0;0;0 | 334464;335168;308832;334176;331136 |
25 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 117 | 301056 | 1137152 | 45726208 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 44.70 | 0.00 | 0.00 | true | 0.446878;0.449868;0.447679;0.447898;0.446900 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
25 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 117 | 301056 | 1137152 | 45726208 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 1706.67 | 6.20 | 208.80 | 89.09 | false | 0.062278;0.062289;0.062289;0.062294;0.062283 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 1792;1792;1664;1664;1664 |
26 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 113 | 200704 | 1003520 | 45726208 | GPU_0_bfc | 802816 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 35553280 | 0.00 | 42.67 | 12.50 | 833273.49 | 1693.01 | false | 0.124668;0.124664;0.124663;0.124669;0.124650 | 35553280;35553280;35553280;35553280;35553280 | 0;128;128;0;0 | 0;0;0;0;0 |
26 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 113 | 200704 | 1003520 | 45726208 | GPU_0_bfc | 802816 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 147456.00 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.437429;0.434486;0.437816;0.433868;0.435396 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 0;0;0;0;128 |
26 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 113 | 200704 | 1003520 | 45726208 | GPU_0_bfc | 802816 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 0.00 | 0.00 | 6.20 | 0.00 | 59.39 | true | 0.062296;0.062301;0.062302;0.062300;0.062301 | 237568;237568;237568;237568;237568 | 0;0;0;128;0 | 0;0;0;0;2048 |
27 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 26 | 301056 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 75264 | 384.00 | 0.00 | 45.00 | 196.00 | 17.37 | false | 0.451491;0.450634;0.450047;0.450230;0.449962 | 75264;75264;75264;75264;75264 | 128;0;0;0;0 | 384;384;384;384;384 |
28 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.40 | 196.00 | 12.54 | false | 0.444674;0.444485;0.443868;0.444577;0.444293 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;0;0;0;128 |
29 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 28 28]] | 20.333 | 301056 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438123;0.437875;0.437804;0.437675;0.438683 | 0;0;0;0;0 | 0;0;0;1792;0 | 0;0;0;128;0 |
30 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 132.667 | 301056 | 1554944 | 45826560 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 28.00 | 79478784 | 0.00 | 77034.67 | 12.50 | 1031.73 | 2838.53 | false | 0.124755;0.124763;0.124757;0.124758;0.124765 | 79478784;79478784;79478784;79478784;79478784 | 78720;77344;75136;76800;76960 | 0;0;0;0;0 |
30 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 132.667 | 301056 | 1554944 | 45826560 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 331776.00 | 85.33 | 44.90 | 0.00 | 0.00 | true | 0.449789;0.449918;0.449238;0.448688;0.448386 | 0;0;0;0;0 | 160;128;128;0;0 | 331776;331776;331776;331776;331776 |
30 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 132.667 | 301056 | 1554944 | 45826560 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 118410.67 | 6.20 | 4.51 | 133.63 | true | 0.062236;0.062231;0.062233;0.062252;0.062229 | 534528;534528;534528;534528;534528 | 0;0;0;0;0 | 117280;117984;118688;118688;118560 |
31 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 25.667 | 301056 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 75264 | 384.00 | 0.00 | 45.10 | 196.00 | 16.13 | false | 0.450261;0.451319;0.451046;0.450371;0.450288 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 0;0;0;128;0 |
33 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 256 28 28]] | 23.667 | 802816 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.33 | 0 | 0.00 | 168373.33 | 67.40 | 0.00 | 0.00 | true | 0.675790;0.674807;0.673555;0.674825;0.673251 | 0;0;0;0;0 | 0;0;0;2048;0 | 168096;169056;168096;168800;168224 |
34 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 256 28 28]] | 44 | 802816 | 802816 | 46328320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 4827031 | 0.00 | 41738.67 | 52.70 | 115.65 | 402.25 | false | 0.526655;0.525636;0.526626;0.527156;0.526699 | 4827031;4827031;4827031;4827031;4827031 | 41664;42016;41536;41408;42016 | 0;0;0;0;0 |
35 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 105 | 200704 | 266240 | 46529024 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 19.67 | 26264576 | 0.00 | 960.00 | 3.10 | 27358.93 | 1335.46 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 256;0;0;0;0 | 960;960;960;1088;960 |
35 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 105 | 200704 | 266240 | 46529024 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 65536.00 | 0.00 | 41.40 | 0.00 | 0.00 | true | 0.412958;0.417925;0.412431;0.412219;0.417016 | 0;0;0;0;0 | 65536;65536;65536;65536;65536 | 0;0;0;0;128 |
36 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 99.667 | 200704 | 266240 | 46729728 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 19.33 | 26264576 | 0.00 | 1333.33 | 3.10 | 19698.44 | 1358.54 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 0;2048;0;0;0 | 1376;1376;1248;1376;1248 |
36 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 99.667 | 200704 | 266240 | 46729728 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 65706.67 | 42.67 | 41.50 | 0.00 | 0.00 | true | 0.413571;0.416787;0.417371;0.415664;0.413594 | 0;0;0;0;0 | 65536;65536;66048;70656;65536 | 0;128;128;0;0 |
37 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 100.667 | 200704 | 266240 | 46930432 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 19.67 | 26264576 | 0.00 | 55872.00 | 3.10 | 470.08 | 1335.46 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 0;0;0;0;0 | 55872;55872;55872;55872;55872 |
37 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 100.667 | 200704 | 266240 | 46930432 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 65536.00 | 42.67 | 41.20 | 0.00 | 0.00 | true | 0.411698;0.412171;0.412741;0.412258;0.412094 | 0;0;0;0;0 | 65792;65536;65536;65536;65536 | 0;0;0;128;128 |
38 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98 | 200704 | 266240 | 46328320 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 19.67 | 26264576 | 0.00 | 48416.00 | 3.10 | 542.48 | 1335.46 | false | 0.031245;0.031245;0.031246;0.031245;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 48544;48288;48416;48416;48416 | 0;0;0;0;0 |
38 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98 | 200704 | 266240 | 46328320 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 65536.00 | 42.67 | 41.50 | 0.00 | 0.00 | true | 0.417019;0.413123;0.414081;0.418707;0.413390 | 0;0;0;0;0 | 0;128;128;0;0 | 65536;65536;65536;65536;65536 |
39 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 28 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 50176 | 256.00 | 0.00 | 45.00 | 196.00 | 10.75 | false | 0.450666;0.449835;0.450353;0.450420;0.450130 | 50176;50176;50176;50176;50176 | 0;0;0;0;128 | 256;8704;256;256;256 |
40 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 21.333 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 170.67 | 44.40 | 117.60 | 12.54 | false | 0.443571;0.443762;0.444295;0.444891;0.444917 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 128;256;128;384;128 |
41 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20.333 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 42.67 | 44.40 | 168.00 | 12.54 | false | 0.442732;0.444180;0.444250;0.444180;0.442479 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;256;128;0;0 |
42 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 50176 | 256.00 | 48000.00 | 44.40 | 1.04 | 10.04 | true | 0.443703;0.445504;0.444940;0.443376;0.442927 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 48128;47616;48000;48000;48000 |
43 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 20.333 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438439;0.438803;0.438621;0.437704;0.438602 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
44 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 18.667 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.435363;0.435435;0.435518;0.435213;0.435594 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;128;0 |
45 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 117.667 | 301056 | 1137152 | 45826560 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 53329920 | 0.00 | 106816.00 | 12.50 | 499.27 | 2539.52 | false | 0.124659;0.124683;0.124678;0.124655;0.124671 | 53329920;53329920;53329920;53329920;53329920 | 0;0;0;0;0 | 106816;106560;106816;106816;106944 |
45 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 117.667 | 301056 | 1137152 | 45826560 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 42.67 | 44.70 | 0.00 | 0.00 | true | 0.446017;0.446895;0.447630;0.446394;0.446279 | 0;0;0;0;0 | 0;128;128;0;0 | 221184;221184;221184;221184;221184 |
45 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 117.667 | 301056 | 1137152 | 45826560 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 213.33 | 6.20 | 1670.40 | 89.09 | false | 0.062286;0.062288;0.062299;0.062291;0.062285 | 356352;356352;356352;356352;356352 | 0;0;0;0;1792 | 256;256;128;256;128 |
46 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 112.667 | 301056 | 1317376 | 45926912 | GPU_0_bfc | 1016320 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 53329920 | 0.00 | 0.00 | 12.50 | 0.00 | 2539.52 | true | 0.124678;0.124665;0.124682;0.124668;0.124667 | 53329920;53329920;53329920;53329920;53329920 | 128;0;0;0;0 | 0;0;0;6912;0 |
46 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 112.667 | 301056 | 1317376 | 45926912 | GPU_0_bfc | 1016320 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 221184.00 | 0.00 | 44.60 | 0.00 | 0.00 | true | 0.448166;0.446137;0.445583;0.446325;0.445544 | 0;0;0;0;0 | 0;0;0;128;0 | 221184;221184;221184;221184;221184 |
46 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 112.667 | 301056 | 1317376 | 45926912 | GPU_0_bfc | 1016320 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 42.67 | 6.20 | 8351.93 | 89.09 | false | 0.062279;0.062278;0.062285;0.062288;0.062279 | 356352;356352;356352;356352;356352 | 0;128;128;0;0 | 0;0;0;0;0 |
47 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 26.333 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 75264 | 384.00 | 0.00 | 45.10 | 196.00 | 17.37 | false | 0.450009;0.450578;0.450902;0.450889;0.451541 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 0;0;0;0;128 |
48 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 20.667 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 75264 | 384.00 | 0.00 | 44.60 | 196.00 | 17.37 | false | 0.446059;0.445718;0.446104;0.447025;0.446848 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 0;0;0;128;0 |
49 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 28 28]] | 19.667 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 42.67 | 43.90 | 0.00 | 0.00 | true | 0.438809;0.438329;0.438914;0.438726;0.438203 | 0;0;0;0;0 | 0;128;384;0;0 | 0;0;0;0;0 |
50 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 120 | 301056 | 1554944 | 46027264 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 28.00 | 79478784 | 0.00 | 77557.33 | 12.50 | 1024.77 | 2838.53 | false | 0.124760;0.124752;0.124756;0.124750;0.124753 | 79478784;79478784;79478784;79478784;79478784 | 0;0;0;0;0 | 77856;76736;76960;78624;77856 |
50 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 120 | 301056 | 1554944 | 46027264 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 331776.00 | 0.00 | 44.90 | 0.00 | 0.00 | true | 0.449216;0.444675;0.448655;0.449036;0.448639 | 0;0;0;0;0 | 331776;331776;337152;331776;331776 | 128;0;0;0;0 |
50 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 120 | 301056 | 1554944 | 46027264 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 534528 | 0.00 | 29770.67 | 6.20 | 17.95 | 123.36 | false | 0.062243;0.062243;0.062245;0.062235;0.062242 | 534528;534528;534528;534528;534528 | 0;0;0;2560;0 | 29856;29344;30752;29472;29984 |
51 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 25.667 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 75264 | 384.00 | 42.67 | 45.10 | 176.40 | 17.37 | false | 0.450457;0.450473;0.451561;0.450770;0.450627 | 75264;75264;75264;75264;75264 | 384;384;5504;384;384 | 0;128;128;0;0 |
53 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 320 28 28]] | 23 | 1003520 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 0.00 | 70485.33 | 64.10 | 0.00 | 0.00 | true | 0.640965;0.627100;0.641582;0.641323;0.643163 | 0;0;0;0;0 | 70784;69376;71424;70016;70656 | 0;0;0;0;0 |
54 | InceptionV2/InceptionV2/Mixed_4a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 320 14 14]] | 37.667 | 250880 | 250880 | 45977088 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.67 | 62720 | 256.00 | 42.67 | 29.60 | 210.00 | 11.07 | false | 0.293756;0.296141;0.295483;0.296929;0.296023 | 62720;62720;62720;62720;62720 | 256;256;256;256;256 | 0;128;0;0;128 |
55 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 104.333 | 200704 | 282624 | 46177792 | GPU_0_bfc | 81920 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 32818176 | 0.00 | 362.67 | 3.10 | 90491.21 | 1426.88 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 32818176;32818176;32818176;32818176;32818176 | 320;1088;448;320;320 | 256;0;0;0;0 |
55 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 104.333 | 200704 | 282624 | 46177792 | GPU_0_bfc | 81920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 81920.00 | 0.00 | 41.90 | 0.00 | 0.00 | true | 0.419913;0.418488;0.418353;0.418122;0.418768 | 0;0;0;0;0 | 81920;81920;81920;81920;81920 | 0;0;0;128;0 |
56 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 28 28]] | 113.667 | 401408 | 565248 | 46579200 | GPU_0_bfc | 163840 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.33 | 65636352 | 0.00 | 2453.33 | 3.90 | 26753.95 | 1969.11 | false | 0.039036;0.039666;0.039182;0.039380;0.039369 | 65636352;65636352;65636352;65636352;65636352 | 0;0;0;0;0 | 2368;2496;2368;2496;2496 |
56 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 28 28]] | 113.667 | 401408 | 565248 | 46579200 | GPU_0_bfc | 163840 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 163840.00 | 0.00 | 41.90 | 0.00 | 0.00 | true | 0.417887;0.418861;0.417750;0.418927;0.420058 | 0;0;0;0;0 | 163840;163840;163840;163840;163840 | 128;0;0;0;0 |
57 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 27.333 | 200704 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 45.00 | 196.00 | 12.54 | false | 0.449847;0.449844;0.450097;0.448279;0.449665 | 50176;50176;50176;50176;50176 | 256;1024;256;256;256 | 0;0;0;128;0 |
58 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 28 28]] | 20.333 | 401408 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 100352 | 512.00 | 42.67 | 50.30 | 180.92 | 25.09 | false | 0.502735;0.502667;0.502817;0.502953;0.501656 | 100352;100352;100352;100352;100352 | 512;512;512;512;512 | 0;128;128;0;0 |
59 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 21.333 | 200704 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438028;0.439016;0.438253;0.438992;0.438807 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
60 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 128 28 28]] | 18.333 | 401408 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 52.80 | 0.00 | 0.00 | true | 0.527749;0.528118;0.527285;0.527725;0.526995 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
61 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 115.333 | 301056 | 1137152 | 45876736 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 53329920 | 0.00 | 140746.67 | 12.50 | 378.91 | 2539.52 | false | 0.124671;0.124666;0.124677;0.124673;0.124668 | 53329920;53329920;53329920;53329920;53329920 | 0;0;0;0;0 | 140576;141856;139808;140960;140704 |
61 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 115.333 | 301056 | 1137152 | 45876736 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 45.20 | 0.00 | 0.00 | true | 0.451638;0.450926;0.452828;0.452075;0.453000 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
61 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 115.333 | 301056 | 1137152 | 45876736 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 1237.33 | 6.20 | 288.00 | 89.09 | false | 0.062307;0.062308;0.062310;0.062317;0.062298 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 1152;1280;1280;1280;1152 |
62 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 162.333 | 200704 | 1368576 | 45876736 | GPU_0_bfc | 1167872 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.67 | 82606720 | 0.00 | 618.67 | 3.10 | 133523.72 | 1203.00 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 82606720;82606720;82606720;82606720;82606720 | 0;0;0;0;0 | 576;704;448;576;2624 |
62 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 162.333 | 200704 | 1368576 | 45876736 | GPU_0_bfc | 1167872 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 737280.00 | 57472.00 | 44.40 | 0.00 | 0.00 | true | 0.444048;0.446063;0.443195;0.444180;0.445053 | 0;0;0;0;0 | 737280;737280;737280;737280;739328 | 55168;58624;56064;58496;57856 |
62 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 162.333 | 200704 | 1368576 | 45876736 | GPU_0_bfc | 1167872 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 6.00 | 0 | 768.00 | 256.00 | 47.80 | 0.00 | 0.00 | true | 0.477616;0.476655;0.477507;0.478309;0.477507 | 0;0;0;0;0 | 768;768;768;1024;768 | 256;256;256;256;384 |
63 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 26.667 | 301056 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 384.00 | 45.10 | 98.00 | 18.82 | false | 0.450718;0.450493;0.451759;0.450313;0.450742 | 75264;75264;75264;75264;75264 | 384;384;2432;384;384 | 384;384;2432;384;384 |
64 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 20.667 | 200704 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 31360 | 640.00 | 256.00 | 44.60 | 35.00 | 6.72 | false | 0.446742;0.446163;0.445768;0.446158;0.446802 | 31360;31360;31360;31360;31360 | 640;0;768;0;128 | 640;640;640;640;640 |
65 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 96 28 28]] | 20 | 301056 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 725.33 | 43.80 | 0.00 | 0.00 | true | 0.438615;0.438324;0.438161;0.437396;0.438119 | 0;0;0;0;0 | 0;256;0;0;0 | 0;0;2048;128;2560 |
66 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 160 14 14]] | 18.333 | 200704 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 896.00 | 43.50 | 0.00 | 0.00 | true | 0.435270;0.435511;0.435718;0.435205;0.434660 | 0;0;0;0;0 | 0;0;0;0;2304 | 2048;3200;0;640;0 |
67 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 144 | 75264 | 733184 | 45550592 | GPU_0_bfc | 657920 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 37177728 | 0.00 | 362.67 | 3.10 | 102512.02 | 701.47 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 37177728;37177728;37177728;37177728;37177728 | 320;320;10560;448;320 | 0;0;0;0;0 |
67 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 144 | 75264 | 733184 | 45550592 | GPU_0_bfc | 657920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 331776.00 | 85941.33 | 44.50 | 0.00 | 0.00 | true | 0.444885;0.445613;0.445875;0.444237;0.444226 | 0;0;0;0;0 | 331776;331776;338432;331776;331776 | 88192;85248;83840;88704;84384 |
67 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 144 | 75264 | 733184 | 45550592 | GPU_0_bfc | 657920 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 5.00 | 0 | 0.00 | 256.00 | 47.70 | 0.00 | 0.00 | true | 0.477232;0.477158;0.477224;0.477385;0.477148 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;256;256 |
68 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 26.333 | 75264 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 42.67 | 44.70 | 44.10 | 4.70 | false | 0.446773;0.446515;0.446558;0.446908;0.446004 | 18816;18816;18816;18816;18816 | 384;384;640;384;384 | 0;128;3840;0;0 |
69 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 96 14 14]] | 20 | 75264 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.436874;0.437005;0.436809;0.437016;0.437309 | 0;0;0;0;0 | 0;0;0;3840;0 | 0;0;0;0;0 |
71 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 41 | 526848 | 526848 | 45701120 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.33 | 2893038 | 0.00 | 104864.00 | 43.40 | 27.59 | 347.18 | false | 0.432672;0.432309;0.435802;0.432644;0.436004 | 2893038;2893038;2893038;2893038;2893038 | 0;0;0;0;0 | 104864;104864;104864;104992;104576 |
72 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116.333 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 24791424 | 0.00 | 1002.67 | 3.10 | 24725.48 | 670.04 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 1856;320;5824;448;704 |
72 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116.333 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 221184.00 | 8405.33 | 44.50 | 0.00 | 0.00 | true | 0.444444;0.444504;0.446241;0.443414;0.445493 | 0;0;0;0;0 | 8320;8576;8320;8064;8704 | 221184;221184;226304;221184;221184 |
73 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 14 14]] | 113.333 | 50176 | 197632 | 45826560 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.33 | 16527616 | 0.00 | 288.00 | 3.10 | 57387.56 | 454.89 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 16527616;16527616;16527616;16527616;16527616 | 1824;288;288;288;288 | 256;0;0;0;0 |
73 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 14 14]] | 113.333 | 50176 | 197632 | 45826560 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 147456.00 | 84138.67 | 43.30 | 0.00 | 0.00 | true | 0.429790;0.435116;0.431231;0.433409;0.433078 | 0;0;0;0;0 | 147456;147456;149504;147456;147456 | 85760;87552;61312;81408;85248 |
74 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 14 14]] | 115.667 | 175616 | 691712 | 46002176 | GPU_0_bfc | 516096 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.67 | 57846656 | 0.00 | 288.00 | 3.10 | 200856.44 | 1577.62 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 57846656;57846656;57846656;57846656;57846656 | 0;0;0;0;0 | 288;288;288;416;288 |
74 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 14 14]] | 115.667 | 175616 | 691712 | 46002176 | GPU_0_bfc | 516096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 516096.00 | 2901.33 | 45.40 | 0.00 | 0.00 | true | 0.454035;0.453888;0.456136;0.454458;0.454624 | 0;0;0;0;0 | 516096;516096;516096;516096;516096 | 1344;2976;2880;2944;2880 |
75 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 113 | 100352 | 451584 | 45650944 | GPU_0_bfc | 351232 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.33 | 33055232 | 0.00 | 288.00 | 3.10 | 114775.11 | 909.79 | false | 0.031247;0.031247;0.031247;0.031248;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;0;0;0;0 | 288;288;288;288;288 |
75 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 113 | 100352 | 451584 | 45650944 | GPU_0_bfc | 351232 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.437099;0.440920;0.441086;0.439496;0.437806 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 0;128;0;0;0 |
76 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 27.333 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 18816 | 554.67 | 0.00 | 44.60 | 33.92 | 4.03 | false | 0.446179;0.446565;0.446098;0.445518;0.447298 | 18816;18816;18816;18816;18816 | 896;14208;384;384;384 | 0;0;0;0;0 |
77 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 14 14]] | 20.333 | 50176 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 12544 | 256.00 | 0.00 | 42.00 | 49.00 | 3.14 | false | 0.420096;0.419485;0.418907;0.420510;0.420995 | 12544;12544;12544;12544;12544 | 256;256;256;256;256 | 0;0;0;128;0 |
78 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 14 14]] | 20.333 | 175616 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 43904 | 896.00 | 0.00 | 45.00 | 49.00 | 10.98 | false | 0.450957;0.450539;0.450867;0.449255;0.449311 | 43904;43904;43904;43904;43904 | 896;896;896;896;896 | 0;128;0;0;0 |
79 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19.333 | 100352 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.50 | 49.00 | 6.27 | false | 0.434768;0.435712;0.435539;0.435255;0.432892 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;0;0 |
80 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 96 14 14]] | 19.333 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.437091;0.436935;0.436693;0.437572;0.436144 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
81 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 14 14]] | 18.333 | 50176 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 42.90 | 0.00 | 0.00 | true | 0.428519;0.428889;0.428941;0.428829;0.429061 | 0;0;0;0;0 | 0;0;0;128;0 | 0;0;0;0;0 |
82 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 125.667 | 100352 | 1856512 | 45224448 | GPU_0_bfc | 1756160 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 27.67 | 26492928 | 0.00 | 192.00 | 12.50 | 137984.00 | 957.56 | false | 0.124788;0.124807;0.124789;0.124792;0.124755 | 26492928;26492928;26492928;26492928;26492928 | 0;0;0;0;0 | 192;192;192;192;192 |
82 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 125.667 | 100352 | 1856512 | 45224448 | GPU_0_bfc | 1756160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 442368.00 | 2688.00 | 44.70 | 0.00 | 0.00 | true | 0.446934;0.447088;0.446755;0.447812;0.448346 | 0;0;0;0;0 | 442368;442368;442368;442368;444416 | 2688;2688;2688;2560;2688 |
82 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 125.667 | 100352 | 1856512 | 45224448 | GPU_0_bfc | 1756160 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 0.00 | 42.67 | 7.50 | 16703.87 | 152.71 | false | 0.075113;0.075118;0.075293;0.075322;0.075278 | 712704;712704;712704;712704;712704 | 0;0;0;0;0 | 0;128;0;128;0 |
83 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 111.667 | 75264 | 941056 | 45224448 | GPU_0_bfc | 865792 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 13332480 | 0.00 | 0.00 | 12.50 | 0.00 | 634.88 | true | 0.124712;0.124714;0.124728;0.124706;0.124724 | 13332480;13332480;13332480;13332480;13332480 | 0;0;0;0;5376 | 0;0;0;0;0 |
83 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 111.667 | 75264 | 941056 | 45224448 | GPU_0_bfc | 865792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 5205.33 | 44.50 | 0.00 | 0.00 | true | 0.445338;0.445688;0.446770;0.444789;0.445025 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 5120;5120;5248;5248;5376 |
83 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 111.667 | 75264 | 941056 | 45224448 | GPU_0_bfc | 865792 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 42.67 | 6.20 | 8351.93 | 89.09 | false | 0.062265;0.062273;0.062258;0.062268;0.062266 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 0;128;0;0;1536 |
84 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 26 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 25088 | 512.00 | 0.00 | 43.80 | 49.00 | 5.79 | false | 0.437680;0.438504;0.437230;0.437789;0.437584 | 25088;25088;25088;25088;25088 | 0;0;0;0;0 | 512;512;512;512;512 |
85 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 22.667 | 75264 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 0.00 | 44.30 | 49.00 | 4.70 | false | 0.443158;0.442495;0.442211;0.443299;0.439956 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;0;0;128;0 |
86 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 128 14 14]] | 20 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.435646;0.435927;0.435778;0.436004;0.435197 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;0;0;0 |
87 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 128.667 | 100352 | 2329088 | 45274624 | GPU_0_bfc | 2228736 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 35209216 | 0.00 | 204320.00 | 12.50 | 172.32 | 1015.64 | false | 0.124844;0.124847;0.124843;0.124842;0.124855 | 35209216;35209216;35209216;35209216;35209216 | 0;0;0;0;0 | 204800;204064;204320;203168;204576 |
87 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 128.667 | 100352 | 2329088 | 45274624 | GPU_0_bfc | 2228736 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 589824.00 | 59669.33 | 43.80 | 0.00 | 0.00 | true | 0.437601;0.438229;0.436731;0.438027;0.438238 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 59648;59776;59584;59904;58112 |
87 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 128.667 | 100352 | 2329088 | 45274624 | GPU_0_bfc | 2228736 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 950272 | 0.00 | 36970.67 | 9.70 | 25.70 | 190.05 | false | 0.097508;0.097105;0.097082;0.097194;0.097377 | 950272;950272;950272;950272;950272 | 0;0;0;0;0 | 36416;36768;36704;37664;37440 |
88 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 26 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 25088 | 512.00 | 0.00 | 43.80 | 49.00 | 5.79 | false | 0.438333;0.437906;0.437607;0.436361;0.437363 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;128;0;0;0 |
90 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 22.667 | 526848 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 26581.33 | 58.50 | 0.00 | 0.00 | true | 0.584270;0.585283;0.584951;0.575763;0.584405 | 0;0;0;0;0 | 0;0;0;0;0 | 26368;27136;26240;22688;28288 |
91 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 40.333 | 451584 | 451584 | 45701120 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 2514363 | 0.00 | 165920.00 | 42.10 | 15.15 | 279.37 | true | 0.420694;0.420839;0.419884;0.420778;0.421264 | 2514363;2514363;2514363;2514363;2514363 | 0;0;0;0;0 | 166240;165120;166400;168864;164288 |
92 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.33 | 24791424 | 0.00 | 107936.00 | 3.10 | 229.69 | 682.34 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 108256;107552;108000;107040;108384 |
92 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221269.33 | 298.67 | 44.50 | 0.00 | 0.00 | true | 0.444888;0.444434;0.444058;0.447021;0.444342 | 0;0;0;0;0 | 221440;222976;221184;221184;221184 | 256;384;256;512;256 |
93 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 114 | 75264 | 296448 | 45851648 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.33 | 24791424 | 0.00 | 38090.67 | 3.10 | 650.85 | 682.34 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;7424;0 | 38560;38560;38432;36512;37280 |
93 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 114 | 75264 | 296448 | 45851648 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 221184.00 | 5333.33 | 44.40 | 0.00 | 0.00 | true | 0.443772;0.444256;0.442621;0.443525;0.443393 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 5120;4736;4864;6016;6144 |
94 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 117.333 | 150528 | 592896 | 46002176 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 49582848 | 0.00 | 38048.00 | 3.10 | 1303.17 | 1340.08 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 49582848;49582848;49582848;49582848;49582848 | 0;0;0;256;0 | 45216;39456;38688;34720;36000 |
94 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 117.333 | 150528 | 592896 | 46002176 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 442368.00 | 99552.00 | 46.00 | 0.00 | 0.00 | true | 0.460775;0.460984;0.460171;0.460208;0.459862 | 0;0;0;0;0 | 93216;97888;99680;101088;103904 | 442368;442368;442368;442368;442368 |
95 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 113 | 100352 | 526848 | 45575680 | GPU_0_bfc | 426496 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 33055232 | 0.00 | 288.00 | 3.10 | 114775.11 | 893.38 | false | 0.031247;0.031247;0.031247;0.031247;0.031248 | 33055232;33055232;33055232;33055232;33055232 | 0;1792;0;0;0 | 288;288;288;288;416 |
95 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 113 | 100352 | 526848 | 45575680 | GPU_0_bfc | 426496 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 4096.00 | 45.00 | 0.00 | 0.00 | true | 0.451183;0.450314;0.451941;0.449338;0.449730 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 4224;4096;4096;4096;4096 |
96 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 28.333 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 18816 | 384.00 | 128.00 | 44.60 | 36.75 | 3.76 | false | 0.446218;0.446365;0.446834;0.445685;0.446680 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 128;128;128;256;128 |
97 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 21 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 18816 | 384.00 | 0.00 | 44.30 | 49.00 | 4.34 | false | 0.441146;0.442497;0.442872;0.442539;0.443688 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;128;0;0;0 |
98 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 20.333 | 150528 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 37632 | 768.00 | 0.00 | 44.30 | 49.00 | 8.68 | false | 0.442954;0.442959;0.442529;0.442501;0.442775 | 37632;37632;37632;37632;37632 | 768;768;768;768;5888 | 128;0;0;0;0 |
99 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19.667 | 100352 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 25088 | 512.00 | 0.00 | 43.50 | 49.00 | 5.38 | false | 0.435355;0.434369;0.436986;0.433067;0.435953 | 25088;25088;25088;25088;25088 | 512;512;512;512;2560 | 0;0;0;0;128 |
100 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 96 14 14]] | 19.667 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.436647;0.436525;0.436725;0.436663;0.436717 | 0;0;0;0;0 | 0;0;0;128;0 | 0;0;0;0;0 |
101 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 96 14 14]] | 19 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.40 | 0.00 | 0.00 | true | 0.433643;0.433809;0.433612;0.433485;0.433892 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;0;0;0 |
102 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 124 | 100352 | 2107392 | 45224448 | GPU_0_bfc | 2007040 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 28.00 | 26492928 | 0.00 | 54464.00 | 12.50 | 486.43 | 946.18 | false | 0.124802;0.124816;0.124800;0.124812;0.124812 | 26492928;26492928;26492928;26492928;26492928 | 55488;53504;54400;52192;56160 | 0;0;0;0;5376 |
102 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 124 | 100352 | 2107392 | 45224448 | GPU_0_bfc | 2007040 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 442368.00 | 28586.67 | 44.80 | 0.00 | 0.00 | true | 0.448446;0.446343;0.448433;0.448507;0.448463 | 0;0;0;0;0 | 442368;442368;442368;442368;442368 | 29312;28416;28672;28416;28672 |
102 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 124 | 100352 | 2107392 | 45224448 | GPU_0_bfc | 2007040 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 712704 | 0.00 | 116853.33 | 7.50 | 6.10 | 164.48 | true | 0.074852;0.075154;0.075054;0.075057;0.074944 | 712704;712704;712704;712704;712704 | 114816;118912;116960;118016;115584 | 0;0;0;0;0 |
103 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 121 | 100352 | 2082304 | 45249536 | GPU_0_bfc | 1981952 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 28.00 | 26492928 | 0.00 | 0.00 | 12.50 | 0.00 | 946.18 | true | 0.124802;0.124794;0.124795;0.124798;0.124778 | 26492928;26492928;26492928;26492928;26492928 | 0;0;2048;0;0 | 0;0;0;0;128 |
103 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 121 | 100352 | 2082304 | 45249536 | GPU_0_bfc | 1981952 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 442368.00 | 0.00 | 44.90 | 0.00 | 0.00 | true | 0.448545;0.449983;0.448803;0.447342;0.450384 | 0;0;0;0;0 | 442368;442368;442368;442368;442368 | 0;128;0;0;0 |
103 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 121 | 100352 | 2082304 | 45249536 | GPU_0_bfc | 1981952 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 0.00 | 0.00 | 7.60 | 0.00 | 152.71 | true | 0.075574;0.075494;0.075570;0.075548;0.075536 | 712704;712704;712704;712704;712704 | 1792;0;0;0;0 | 128;0;0;0;0 |
104 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 25.667 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.80 | 49.00 | 6.27 | false | 0.438422;0.436894;0.438161;0.436693;0.437848 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;128;0 |
105 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 20 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.60 | 49.00 | 6.27 | false | 0.436008;0.435893;0.437586;0.435507;0.436064 | 25088;25088;25088;25088;25088 | 512;512;1536;512;512 | 0;128;0;0;0 |
106 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 128 14 14]] | 20 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 43.40 | 0.00 | 0.00 | true | 0.434457;0.434173;0.434134;0.434429;0.434620 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
107 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126 | 100352 | 2466816 | 45274624 | GPU_0_bfc | 2366464 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 35209216 | 0.00 | 183882.67 | 12.50 | 191.48 | 1015.64 | false | 0.124847;0.124848;0.124835;0.124848;0.124833 | 35209216;35209216;35209216;35209216;35209216 | 0;0;0;0;0 | 179232;186400;185504;180128;186016 |
107 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126 | 100352 | 2466816 | 45274624 | GPU_0_bfc | 2366464 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.67 | 0 | 589824.00 | 213.33 | 43.60 | 0.00 | 0.00 | true | 0.437381;0.435829;0.437051;0.434782;0.436182 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 128;256;256;128;256 |
107 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126 | 100352 | 2466816 | 45274624 | GPU_0_bfc | 2366464 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 950272 | 0.00 | 45312.00 | 9.80 | 20.97 | 190.05 | false | 0.097752;0.097627;0.097585;0.097483;0.097518 | 950272;950272;950272;950272;950272 | 51456;44160;45184;46208;44544 | 6656;0;0;0;0 |
108 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 25.333 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 25088 | 512.00 | 0.00 | 43.80 | 49.00 | 5.02 | false | 0.438939;0.438559;0.437035;0.436599;0.439053 | 25088;25088;25088;25088;25088 | 512;512;512;2304;512 | 128;0;0;0;0 |
110 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 23 | 727552 | 0 | 45450240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 58.40 | 0.00 | 0.00 | true | 0.584423;0.583766;0.585061;0.583865;0.584633 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
111 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 40.667 | 451584 | 451584 | 45901824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.67 | 2461563 | 0.00 | 128.00 | 42.40 | 19230.96 | 254.64 | false | 0.423168;0.424348;0.423921;0.422629;0.423808 | 2461563;2461563;2461563;2461563;2461563 | 0;0;0;0;0 | 128;128;128;256;128 |
112 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 110 | 100352 | 401408 | 46002176 | GPU_0_bfc | 301056 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.67 | 33055232 | 0.00 | 800.00 | 3.10 | 41319.04 | 901.50 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;0;0;0;0 | 928;800;800;800;800 |
112 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 110 | 100352 | 401408 | 46002176 | GPU_0_bfc | 301056 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 294912.00 | 128.00 | 44.80 | 0.00 | 0.00 | true | 0.449931;0.448321;0.442526;0.448399;0.448638 | 0;0;0;0;0 | 128;128;128;128;128 | 294912;294912;294912;294912;294912 |
113 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 116 | 100352 | 395264 | 46102528 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 33055232 | 0.00 | 1525.33 | 3.10 | 21670.83 | 893.38 | false | 0.031248;0.031248;0.031247;0.031247;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;1536;0;0;0 | 32;3104;1568;2976;32 |
113 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 116 | 100352 | 395264 | 46102528 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 294912.00 | 85546.67 | 44.60 | 0.00 | 0.00 | true | 0.451549;0.442987;0.452767;0.443231;0.442445 | 0;0;0;0;0 | 294912;294912;294912;299008;294912 | 85504;85760;85504;85504;85632 |
114 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 114.667 | 200704 | 569344 | 46303232 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 41319040 | 0.00 | 32.00 | 3.10 | 1291220.00 | 1116.73 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 41319040;41319040;41319040;41319040;41319040 | 0;0;0;0;0 | 160;32;32;32;32 |
114 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 114.667 | 200704 | 569344 | 46303232 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 368640.00 | 99029.33 | 46.30 | 0.00 | 0.00 | true | 0.462768;0.463338;0.462604;0.462507;0.462854 | 0;0;0;0;0 | 368640;368640;368640;368640;368640 | 100480;97408;98944;97664;100480 |
115 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 114 | 75264 | 296448 | 45650944 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 24791424 | 0.00 | 74.67 | 3.10 | 332026.52 | 670.04 | false | 0.031248;0.031248;0.031247;0.031248;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 32;32;32;160;6560 |
115 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 114 | 75264 | 296448 | 45650944 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 103210.67 | 44.70 | 0.00 | 0.00 | true | 0.447818;0.447883;0.446929;0.446582;0.445284 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 103168;103040;103168;103296;103296 |
116 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 27.333 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 25088 | 512.00 | 0.00 | 44.00 | 49.00 | 5.79 | false | 0.440319;0.440469;0.440142;0.439582;0.440741 | 25088;25088;25088;25088;25088 | 512;512;512;512;2560 | 0;0;0;0;0 |
117 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 20 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 25088 | 512.00 | 0.00 | 43.60 | 49.00 | 5.38 | false | 0.436215;0.436278;0.437020;0.434685;0.436918 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 128;0;0;0;0 |
118 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 20.333 | 200704 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 31360 | 640.00 | 0.00 | 44.80 | 49.00 | 6.72 | false | 0.448193;0.447858;0.447212;0.447262;0.447794 | 31360;31360;31360;31360;31360 | 0;0;0;0;12672 | 640;640;640;640;640 |
119 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 20 | 75264 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 18816 | 384.00 | 0.00 | 44.30 | 49.00 | 4.34 | false | 0.442904;0.443787;0.443797;0.442653;0.443685 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;0;0;128;0 |
120 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 20 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.437491;0.436867;0.438095;0.438132;0.437082 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
121 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 19.333 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.30 | 0.00 | 0.00 | true | 0.432715;0.431778;0.432265;0.433159;0.432776 | 0;0;0;0;0 | 0;0;0;0;3840 | 128;0;0;0;0 |
122 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 133.667 | 125440 | 2911488 | 45324800 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 44011520 | 0.00 | 50133.33 | 12.50 | 877.89 | 1257.47 | false | 0.124846;0.124850;0.124851;0.124842;0.124843 | 44011520;44011520;44011520;44011520;44011520 | 4864;0;0;0;0 | 51840;48768;50432;45440;51200 |
122 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 133.667 | 125440 | 2911488 | 45324800 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 737280.00 | 60789.33 | 44.30 | 0.00 | 0.00 | true | 0.442816;0.442836;0.443432;0.445674;0.441452 | 0;0;0;0;0 | 737280;737280;737280;737280;737280 | 60192;61088;61088;61216;41248 |
122 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 133.667 | 125440 | 2911488 | 45324800 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 1187840 | 0.00 | 71584.00 | 11.20 | 16.59 | 237.57 | true | 0.112427;0.111711;0.111875;0.111722;0.111735 | 1187840;1187840;1187840;1187840;1187840 | 68864;73632;70592;73952;70528 | 0;0;0;0;0 |
123 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 129 | 125440 | 2911488 | 45349888 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 44011520 | 0.00 | 26784.00 | 12.50 | 1643.20 | 1269.55 | false | 0.124851;0.124829;0.124853;0.124842;0.124850 | 44011520;44011520;44011520;44011520;44011520 | 0;0;0;0;0 | 22048;24736;27552;28064;29216 |
123 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 129 | 125440 | 2911488 | 45349888 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 737280.00 | 2730.67 | 44.30 | 0.00 | 0.00 | true | 0.442538;0.441103;0.442020;0.443321;0.443201 | 0;0;0;0;0 | 737536;737280;737280;737280;737280 | 2816;2816;2432;2560;3200 |
123 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 129 | 125440 | 2911488 | 45349888 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 1187840 | 0.00 | 80138.67 | 11.10 | 14.82 | 237.57 | true | 0.110742;0.110544;0.110605;0.110673;0.110290 | 1187840;1187840;1187840;1187840;1187840 | 0;0;0;512;0 | 81504;81888;77280;81632;74368 |
124 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 25.333 | 125440 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 31360 | 640.00 | 256.00 | 44.90 | 35.00 | 6.72 | false | 0.448562;0.448903;0.448647;0.448517;0.448490 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 256;256;256;256;256 |
125 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 20.333 | 125440 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 31360 | 640.00 | 85.33 | 44.70 | 43.24 | 7.24 | false | 0.446296;0.446735;0.446715;0.447125;0.447218 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 256;0;0;0;256 |
126 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 160 14 14]] | 20.333 | 125440 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 1322.67 | 43.70 | 0.00 | 0.00 | true | 0.436622;0.437110;0.436976;0.436950;0.436808 | 0;0;0;0;0 | 1152;1152;1408;1408;1536 | 1024;0;0;0;0 |
127 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 134.333 | 200704 | 3683072 | 45450240 | GPU_0_bfc | 3482368 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 43.00 | 54906880 | 0.00 | 340309.33 | 12.50 | 161.34 | 1276.90 | false | 0.124878;0.124880;0.124869;0.124865;0.124875 | 54906880;54906880;54906880;54906880;54906880 | 341184;333728;337312;342432;345280 | 0;0;0;0;0 |
127 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 134.333 | 200704 | 3683072 | 45450240 | GPU_0_bfc | 3482368 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 921600.00 | 23466.67 | 44.10 | 0.00 | 0.00 | true | 0.442373;0.440589;0.441325;0.442262;0.439658 | 0;0;0;0;0 | 23488;23424;23232;24192;23488 | 921600;921600;921600;921600;921600 |
127 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 134.333 | 200704 | 3683072 | 45450240 | GPU_0_bfc | 3482368 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 1484800 | 64.00 | 147680.00 | 13.60 | 10.05 | 296.96 | true | 0.135468;0.135927;0.135595;0.135744;0.136401 | 1484800;1484800;1484800;1484800;1484800 | 146560;154624;151328;145152;142336 | 64;64;1600;64;64 |
128 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 25.333 | 200704 | 0 | 45324800 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 31360 | 640.00 | 213.33 | 44.90 | 36.75 | 6.72 | false | 0.447229;0.449043;0.448959;0.448188;0.448742 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 384;128;128;256;256 |
130 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 23 | 451584 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 23754.67 | 58.30 | 0.00 | 0.00 | true | 0.583004;0.583533;0.582702;0.584131;0.583571 | 0;0;0;0;0 | 23840;24096;23456;23968;23456 | 0;0;0;0;0 |
131 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 40.333 | 727552 | 727552 | 45901824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.33 | 2417538 | 0.00 | 326517.33 | 42.70 | 7.40 | 259.03 | true | 0.427836;0.428528;0.424112;0.426920;0.424968 | 2417538;2417538;2417538;2417538;2417538 | 0;0;0;0;0 | 326400;326624;327008;326528;326400 |
132 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 120.667 | 125440 | 494080 | 46027264 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.33 | 41319040 | 0.00 | 320533.33 | 3.10 | 128.91 | 1137.23 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 41319040;41319040;41319040;41319040;41319040 | 0;0;0;0;0 | 322624;299584;315456;323520;326848 |
132 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 120.667 | 125440 | 494080 | 46027264 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 368640.00 | 75029.33 | 45.50 | 0.00 | 0.00 | true | 0.454882;0.454753;0.452882;0.455265;0.454244 | 0;0;0;0;0 | 72224;83232;79488;73376;71072 | 368640;368640;370944;368640;368640 |
133 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 114.667 | 100352 | 395264 | 46127616 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.33 | 33055232 | 0.00 | 23274.67 | 3.10 | 1420.22 | 909.79 | false | 0.031247;0.031248;0.031248;0.031248;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;0;0;0;0 | 23232;23360;23360;23232;23232 |
133 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 114.667 | 100352 | 395264 | 46127616 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 0.00 | 45.10 | 0.00 | 0.00 | true | 0.451367;0.452483;0.450829;0.449775;0.451920 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 0;0;0;128;0 |
134 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 118 | 75264 | 296448 | 46202880 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.00 | 24791424 | 0.00 | 1002.67 | 3.10 | 24725.48 | 688.65 | false | 0.031247;0.031248;0.031247;0.031248;0.031248 | 24791424;24791424;24791424;24791424;24791424 | 1088;960;960;960;1088 | 0;0;0;0;0 |
134 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 118 | 75264 | 296448 | 46202880 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 896.00 | 44.50 | 0.00 | 0.00 | true | 0.445023;0.445731;0.444157;0.445467;0.444325 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 896;1024;896;896;896 |
135 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 112.333 | 75264 | 451584 | 45826560 | GPU_0_bfc | 376320 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.00 | 24791424 | 0.00 | 288.00 | 3.10 | 86081.33 | 688.65 | false | 0.031247;0.031248;0.031247;0.031248;0.031248 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 288;288;416;288;288 |
135 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 112.333 | 75264 | 451584 | 45826560 | GPU_0_bfc | 376320 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 44.50 | 0.00 | 0.00 | true | 0.445336;0.445678;0.445900;0.445054;0.445042 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
136 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 28.333 | 125440 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 31360 | 640.00 | 170.67 | 45.10 | 38.68 | 6.72 | false | 0.451503;0.451778;0.451094;0.450989;0.451197 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 256;256;128;128;128 |
137 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 21.333 | 100352 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 25088 | 512.00 | 0.00 | 43.60 | 49.00 | 5.79 | false | 0.434542;0.435397;0.436269;0.435726;0.436994 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;0;128 |
138 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 20 | 75264 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 18816 | 384.00 | 0.00 | 44.30 | 49.00 | 3.76 | false | 0.443548;0.443455;0.441759;0.442616;0.442097 | 18816;18816;18816;18816;18816 | 0;0;0;128;0 | 384;384;384;384;14208 |
139 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 20.333 | 75264 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 18816 | 384.00 | 0.00 | 44.30 | 49.00 | 4.34 | false | 0.443087;0.442975;0.442807;0.443457;0.440261 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;0;128;0;0 |
140 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 160 14 14]] | 19.333 | 125440 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 42.67 | 43.90 | 0.00 | 0.00 | true | 0.438379;0.438832;0.438538;0.438660;0.438528 | 0;0;0;0;0 | 0;0;0;5120;0 | 128;128;0;0;0 |
141 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 18.333 | 100352 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 43.20 | 0.00 | 0.00 | true | 0.432171;0.432133;0.432065;0.432088;0.432139 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
142 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 142.333 | 150528 | 4329216 | 45249536 | GPU_0_bfc | 4178688 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 43.00 | 65888256 | 0.00 | 555402.67 | 12.50 | 118.63 | 1532.29 | false | 0.124882;0.124887;0.124882;0.124884;0.124885 | 65888256;65888256;65888256;65888256;65888256 | 0;0;256;0;0 | 557216;557248;552288;555808;553184 |
142 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 142.333 | 150528 | 4329216 | 45249536 | GPU_0_bfc | 4178688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1105920.00 | 639840.00 | 44.90 | 0.00 | 0.00 | true | 0.448995;0.446987;0.451271;0.449651;0.449477 | 0;0;0;0;0 | 1105920;1105920;1112576;1105920;1105920 | 643648;640160;641280;638080;634688 |
142 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 142.333 | 150528 | 4329216 | 45249536 | GPU_0_bfc | 4178688 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 1781760 | 0.00 | 1232064.00 | 15.00 | 1.45 | 356.35 | true | 0.149785;0.149902;0.149445;0.149569;0.149876 | 1781760;1781760;1781760;1781760;1781760 | 0;0;0;0;0 | 1224960;1228192;1256480;1231072;1236928 |
143 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 133.667 | 150528 | 3493632 | 45274624 | GPU_0_bfc | 3343104 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 52813824 | 5002.67 | 251776.00 | 12.50 | 205.68 | 1508.97 | false | 0.124844;0.124836;0.124834;0.124841;0.124841 | 52813824;52813824;52813824;52813824;52813824 | 16384;2688;2560;9760;2560 | 254624;250592;247776;253024;251712 |
143 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 133.667 | 150528 | 3493632 | 45274624 | GPU_0_bfc | 3343104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 884736.00 | 428576.00 | 44.40 | 0.00 | 0.00 | true | 0.443902;0.443962;0.445097;0.446121;0.443989 | 0;0;0;0;0 | 884736;884736;884736;884736;884736 | 432736;423392;400576;435744;429600 |
143 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 133.667 | 150528 | 3493632 | 45274624 | GPU_0_bfc | 3343104 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 1425408 | 0.00 | 595893.33 | 13.10 | 2.39 | 285.08 | true | 0.131113;0.131152;0.131005;0.131056;0.130842 | 1425408;1425408;1425408;1425408;1425408 | 588256;605984;595840;603584;581088 | 0;0;0;0;0 |
144 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 26.333 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 426.67 | 44.50 | 31.50 | 9.41 | false | 0.444827;0.444850;0.444952;0.444045;0.444799 | 37632;37632;37632;37632;37632 | 768;768;768;768;768 | 512;384;384;512;256 |
145 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 20.667 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 9173.33 | 44.10 | 3.79 | 9.41 | true | 0.441286;0.441305;0.440905;0.441256;0.441156 | 37632;37632;37632;37632;37632 | 768;768;768;768;768 | 10240;9728;8576;8832;8960 |
146 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 192 14 14]] | 20 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 9472.00 | 43.80 | 0.00 | 0.00 | true | 0.438008;0.437491;0.437195;0.437822;0.437438 | 0;0;0;0;0 | 10880;8448;8448;9728;10240 | 0;0;0;0;0 |
147 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 152 | 150528 | 5164800 | 45324800 | GPU_0_bfc | 5014272 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 78962688 | 85.33 | 528981.33 | 12.50 | 149.25 | 1579.25 | false | 0.124896;0.124892;0.124887;0.124895;0.124887 | 78962688;78962688;78962688;78962688;78962688 | 0;5120;0;256;0 | 529824;528704;529152;516544;529088 |
147 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 152 | 150528 | 5164800 | 45324800 | GPU_0_bfc | 5014272 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1327168.00 | 73216.00 | 44.00 | 0.00 | 0.00 | true | 0.441352;0.439119;0.438517;0.439879;0.439713 | 0;0;0;0;0 | 1327168;1327168;1327168;1327168;1327168 | 84864;79040;65536;72544;68064 |
147 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 152 | 150528 | 5164800 | 45324800 | GPU_0_bfc | 5014272 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2138112 | 0.00 | 1011477.33 | 17.40 | 2.11 | 356.35 | true | 0.174960;0.174257;0.173806;0.173557;0.174730 | 2138112;2138112;2138112;2138112;2138112 | 0;0;0;0;0 | 1009280;1012160;1010976;1012896;1011296 |
148 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 26.333 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 37632 | 768.00 | 10154.67 | 44.40 | 3.45 | 8.68 | true | 0.444081;0.444464;0.443791;0.444138;0.443185 | 37632;37632;37632;37632;37632 | 768;768;768;768;768 | 9728;9600;10624;10112;10624 |
150 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 23 | 727552 | 0 | 45450240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 256789.33 | 58.40 | 0.00 | 0.00 | true | 0.583821;0.583968;0.584139;0.584620;0.583361 | 0;0;0;0;0 | 256800;230464;257024;257120;256544 | 0;2048;0;0;0 |
151 | InceptionV2/InceptionV2/Mixed_5a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 576 7 7]] | 39.333 | 112896 | 112896 | 45563136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.00 | 28224 | 597.33 | 113216.00 | 13.70 | 0.25 | 5.64 | true | 0.136617;0.136890;0.136592;0.136785;0.136665 | 28224;28224;28224;28224;28224 | 113312;113056;113184;113280;113184 | 256;512;512;768;768 |
152 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 118.333 | 150528 | 592896 | 45713664 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 49582848 | 0.00 | 320896.00 | 3.10 | 154.51 | 1340.08 | false | 0.031248;0.031248;0.031248;0.031248;0.031247 | 49582848;49582848;49582848;49582848;49582848 | 0;0;0;512;0 | 326176;325248;325152;312288;304608 |
152 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 118.333 | 150528 | 592896 | 45713664 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 442368.00 | 120053.33 | 45.90 | 0.00 | 0.00 | true | 0.459765;0.459531;0.458043;0.459415;0.459062 | 0;0;0;0;0 | 442368;442368;442368;442368;442368 | 114240;115584;115904;128672;132992 |
153 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 119.333 | 100352 | 395264 | 45814016 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 33055232 | 0.00 | 330.67 | 3.10 | 99965.32 | 893.38 | false | 0.031247;0.031248;0.031248;0.031247;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 416;416;288;288;288 | 0;0;0;0;0 |
153 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 119.333 | 100352 | 395264 | 45814016 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 294912.00 | 0.00 | 45.30 | 0.00 | 0.00 | true | 0.453953;0.454176;0.452128;0.451727;0.451147 | 0;0;0;0;0 | 0;0;128;0;0 | 294912;294912;294912;294912;296960 |
154 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 28.333 | 150528 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 37632 | 768.00 | 128.00 | 44.70 | 42.00 | 8.68 | false | 0.446625;0.447079;0.446049;0.447194;0.446955 | 37632;37632;37632;37632;37632 | 768;768;768;768;768 | 128;128;128;128;256 |
155 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 20.667 | 100352 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 25088 | 512.00 | 0.00 | 43.60 | 49.00 | 5.38 | false | 0.435372;0.435378;0.436328;0.436570;0.433906 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;128;0 |
156 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 14 14]] | 20 | 150528 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.437475;0.437198;0.437309;0.437061;0.438130 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;128;0;0 |
157 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 19 | 100352 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 42.67 | 43.50 | 0.00 | 0.00 | true | 0.434923;0.434226;0.434008;0.434656;0.434724 | 0;0;0;0;0 | 128;128;0;0;0 | 0;0;0;0;12032 |
158 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 14 14]] | 155 | 200704 | 6886400 | 45287168 | GPU_0_bfc | 6685696 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 51.00 | 105283584 | 147808.00 | 869056.00 | 12.50 | 103.54 | 2064.38 | false | 0.124897;0.124901;0.124892;0.124897;0.124901 | 105283584;105283584;105283584;105283584;105283584 | 147936;147424;146208;148064;148320 | 908736;912320;911872;766912;786560 |
158 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 14 14]] | 155 | 200704 | 6886400 | 45287168 | GPU_0_bfc | 6685696 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.00 | 0 | 1769472.00 | 1357717.33 | 45.30 | 0.00 | 0.00 | true | 0.452937;0.450403;0.453849;0.455009;0.451782 | 0;0;0;0;0 | 1344608;1366848;1343936;1361696;1368768 | 1769472;1769472;1769472;1769472;1769472 |
158 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 14 14]] | 155 | 200704 | 6886400 | 45287168 | GPU_0_bfc | 6685696 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.67 | 2850816 | 85.33 | 2515989.33 | 22.40 | 1.13 | 427.60 | true | 0.223614;0.224709;0.225127;0.224286;0.223769 | 2850816;2850816;2850816;2850816;2850816 | 64;128;128;64;64 | 2478368;2453536;2484448;2602368;2585152 |
159 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 164.667 | 37632 | 1037568 | 45174272 | GPU_0_bfc | 999936 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 69.00 | 28320960 | 192.00 | 140298.67 | 3.10 | 201.59 | 410.45 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 28320960;28320960;28320960;28320960;28320960 | 192;192;192;192;192 | 147680;148992;136032;137184;134560 |
159 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 164.667 | 37632 | 1037568 | 45174272 | GPU_0_bfc | 999936 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 884736.00 | 886944.00 | 44.90 | 0.00 | 0.00 | true | 0.447516;0.447862;0.449408;0.450988;0.454395 | 0;0;0;0;0 | 884736;884736;884736;884736;884736 | 888352;883872;883232;888608;900640 |
159 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 164.667 | 37632 | 1037568 | 45174272 | GPU_0_bfc | 999936 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 5.00 | 0 | 100437.33 | 13813.33 | 46.60 | 0.00 | 0.00 | true | 0.467286;0.467972;0.465448;0.464758;0.464909 | 0;0;0;0;0 | 13344;13216;12960;16160;14880 | 100608;100224;100352;100608;100352 |
160 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 14 14]] | 28.667 | 200704 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 50176 | 1024.00 | 469.33 | 45.00 | 33.60 | 11.58 | false | 0.449843;0.449775;0.450706;0.449149;0.449677 | 50176;50176;50176;50176;50176 | 2816;1024;1024;1024;1024 | 512;512;512;384;384 |
161 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 20.333 | 37632 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 9408 | 768.00 | 42.67 | 42.20 | 11.61 | 2.17 | true | 0.422015;0.420560;0.422173;0.418747;0.422209 | 9408;9408;9408;9408;9408 | 768;768;768;3328;768 | 128;128;0;0;0 |
162 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 256 14 14]] | 20.333 | 200704 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438574;0.438592;0.438298;0.438159;0.437844 | 0;0;0;0;0 | 256;0;0;0;0 | 0;0;0;0;128 |
163 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 7 7]] | 18.667 | 37632 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 42.80 | 0.00 | 0.00 | true | 0.428461;0.428798;0.428157;0.428049;0.428135 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;128;0 |
164 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 7 7]] | 225.333 | 50176 | 2639872 | 45124096 | GPU_0_bfc | 2589696 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 131.33 | 75510016 | 0.00 | 59573.33 | 3.10 | 1267.51 | 574.95 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75510016;75510016;75510016;75510016;75510016 | 61280;60128;60896;57312;57696 | 0;0;0;6656;0 |
164 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 7 7]] | 225.333 | 50176 | 2639872 | 45124096 | GPU_0_bfc | 2589696 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 18.00 | 0 | 2359296.00 | 1617930.67 | 44.60 | 0.00 | 0.00 | true | 0.446946;0.445636;0.446134;0.443916;0.450309 | 0;0;0;0;0 | 1615680;1614816;1615680;1622432;1622784 | 2359296;2361344;2359296;2359296;2359296 |
164 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 7 7]] | 225.333 | 50176 | 2639872 | 45124096 | GPU_0_bfc | 2589696 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 4.67 | 0 | 0.00 | 1450.67 | 47.70 | 0.00 | 0.00 | true | 0.476524;0.476427;0.476721;0.476496;0.476487 | 0;0;0;0;0 | 0;0;0;0;0 | 1408;3328;1536;1408;1408 |
165 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 7 7]] | 28.333 | 50176 | 0 | 44923392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 12544 | 1024.00 | 128.00 | 42.70 | 10.89 | 2.89 | true | 0.428741;0.423794;0.429174;0.428208;0.422775 | 12544;12544;12544;12544;12544 | 1536;1024;1024;1024;1024 | 128;128;128;256;128 |
166 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 256 7 7]] | 20 | 50176 | 0 | 44923392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 512.00 | 43.10 | 0.00 | 0.00 | true | 0.431067;0.431235;0.431154;0.431363;0.431099 | 0;0;0;0;0 | 512;512;640;512;512 | 0;0;0;1024;0 |
168 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 7 7]] | 40.667 | 200704 | 200704 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 1087642 | 341.33 | 57397.33 | 21.70 | 18.84 | 155.38 | false | 0.218788;0.216420;0.217279;0.215815;0.217187 | 1087642;1087642;1087642;1087642;1087642 | 1024;5120;0;0;0 | 57184;57824;58080;56416;57184 |
169 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 7 7]] | 140.333 | 31488 | 686848 | 45155584 | GPU_0_bfc | 655360 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.00 | 20979360 | 0.00 | 181557.33 | 3.10 | 115.55 | 343.92 | false | 0.031249;0.031248;0.031249;0.031248;0.031249 | 20979360;20979360;20979360;20979360;20979360 | 180992;175808;181216;182464;183392 | 0;0;0;0;0 |
169 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 7 7]] | 140.333 | 31488 | 686848 | 45155584 | GPU_0_bfc | 655360 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.33 | 0 | 655360.00 | 244650.67 | 43.60 | 0.00 | 0.00 | true | 0.428988;0.437358;0.439127;0.435947;0.433259 | 0;0;0;0;0 | 655360;655360;657408;655360;655360 | 247552;248416;243840;241408;242560 |
170 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 136.667 | 37632 | 824064 | 45193216 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.00 | 25175232 | 0.00 | 17130.67 | 3.10 | 1469.60 | 412.71 | false | 0.031249;0.031249;0.031248;0.031249;0.031249 | 25175232;25175232;25175232;25175232;25175232 | 0;0;0;0;0 | 17344;16576;19136;16704;17344 |
170 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 136.667 | 37632 | 824064 | 45193216 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 786432.00 | 20597.33 | 43.70 | 0.00 | 0.00 | true | 0.441463;0.442910;0.434195;0.434815;0.435791 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 18848;23680;18592;21152;21792 |
171 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 141.667 | 69120 | 1510912 | 45262336 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.33 | 46154592 | 0.00 | 52106.67 | 3.10 | 885.77 | 752.52 | false | 0.031248;0.031249;0.031249;0.031249;0.031249 | 46154592;46154592;46154592;46154592;46154592 | 0;0;0;0;0 | 53984;52576;55488;49504;49760 |
171 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 141.667 | 69120 | 1510912 | 45262336 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1441792.00 | 174517.33 | 42.90 | 0.00 | 0.00 | true | 0.429837;0.428688;0.428818;0.428238;0.426259 | 0;0;0;0;0 | 174848;172864;169344;175840;177568 | 1441792;1441792;1441792;1441792;1441792 |
172 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 137 | 25088 | 549376 | 45086720 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.33 | 16783488 | 0.00 | 832.00 | 3.10 | 20172.46 | 273.65 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 16783488;16783488;16783488;16783488;16783488 | 0;0;0;0;0 | 832;832;960;832;704 |
172 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 137 | 25088 | 549376 | 45086720 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.67 | 0 | 524288.00 | 307456.00 | 43.60 | 0.00 | 0.00 | true | 0.433834;0.437694;0.435998;0.435303;0.435604 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 302208;306304;307136;308928;313024 |
173 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 7 7]] | 27.333 | 31488 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 7840 | 682.67 | 0.00 | 45.90 | 11.48 | 1.57 | true | 0.457109;0.456042;0.462888;0.462656;0.456210 | 7840;7840;7840;7840;7840 | 0;128;0;0;0 | 768;640;640;768;640 |
174 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 22 | 37632 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 9408 | 768.00 | 768.00 | 42.10 | 6.12 | 1.88 | true | 0.421050;0.423010;0.421184;0.420098;0.417031 | 9408;9408;9408;9408;9408 | 768;768;768;768;896 | 768;768;768;768;14592 |
175 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 352 7 7]] | 21.667 | 69120 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 17248 | 1408.00 | 810.67 | 43.40 | 7.77 | 3.70 | true | 0.434044;0.436710;0.435025;0.433665;0.432882 | 17248;17248;17248;17248;17248 | 1408;1408;1408;1408;1408 | 1792;0;512;128;5888 |
176 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 7 7]] | 19.667 | 25088 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 6272 | 512.00 | 426.67 | 41.60 | 6.68 | 1.34 | true | 0.416002;0.417643;0.417087;0.415548;0.414291 | 6272;6272;6272;6272;6272 | 384;384;512;384;512 | 512;512;512;512;512 |
177 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 160 7 7]] | 20 | 31488 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 42.67 | 44.10 | 0.00 | 0.00 | true | 0.440970;0.440775;0.440475;0.440206;0.440850 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;0;0;512 |
178 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 7 7]] | 18.333 | 37632 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 42.90 | 0.00 | 0.00 | true | 0.428988;0.428895;0.429106;0.429808;0.428931 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
179 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 145.333 | 44032 | 4919296 | 44930048 | GPU_0_bfc | 4875264 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 43.00 | 38434816 | 0.00 | 438656.00 | 12.50 | 87.62 | 893.83 | false | 0.124916;0.124915;0.124909;0.124913;0.124917 | 38434816;38434816;38434816;38434816;38434816 | 0;0;1024;0;0 | 439456;439104;427808;438176;438688 |
179 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 145.333 | 44032 | 4919296 | 44930048 | GPU_0_bfc | 4875264 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1290240.00 | 435125.33 | 44.30 | 0.00 | 0.00 | true | 0.443313;0.452318;0.445199;0.441438;0.441229 | 0;0;0;0;0 | 1290240;1290240;1296896;1290240;1290240 | 436512;436992;435200;433664;424832 |
179 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 145.333 | 44032 | 4919296 | 44930048 | GPU_0_bfc | 4875264 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2078720 | 0.00 | 958560.00 | 17.60 | 2.17 | 346.45 | true | 0.176007;0.175423;0.176081;0.176479;0.175642 | 2078720;2078720;2078720;2078720;2078720 | 956768;961248;959328;957888;958464 | 0;0;0;0;0 |
180 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 153 | 62720 | 8419840 | 44961280 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.33 | 65802240 | 37856.00 | 690666.67 | 12.50 | 90.32 | 1307.34 | false | 0.124932;0.124931;0.124931;0.124934;0.124932 | 65802240;65802240;65802240;65802240;65802240 | 719328;745664;755104;607008;601632 | 37856;37856;37856;38624;37856 |
180 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 153 | 62720 | 8419840 | 44961280 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.67 | 0 | 2211904.00 | 1961280.00 | 46.50 | 0.00 | 0.00 | true | 0.467578;0.466015;0.464332;0.464725;0.465628 | 0;0;0;0;0 | 2211904;2211904;2211904;2211904;2211904 | 1962912;1965664;1954400;1958080;1962848 |
180 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 153 | 62720 | 8419840 | 44961280 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 7.67 | 3563520 | 15168.00 | 3143701.33 | 25.00 | 1.13 | 464.79 | true | 0.248120;0.245739;0.250923;0.251324;0.251623 | 3563520;3563520;3563520;3563520;3563520 | 15680;11072;11072;18752;20160 | 3122496;3095040;3099616;3230656;3208992 |
181 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 27.667 | 44032 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 10976 | 42666.67 | 725.33 | 45.20 | 0.25 | 2.20 | true | 0.451405;0.452135;0.453879;0.448981;0.451284 | 10976;10976;10976;10976;10976 | 42752;43136;42752;42496;42496 | 768;704;704;960;576 |
182 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 7 7]] | 21 | 62720 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 15680 | 1280.00 | 8576.00 | 45.20 | 1.59 | 3.14 | true | 0.452080;0.453229;0.452218;0.451118;0.446907 | 15680;15680;15680;15680;15680 | 1280;1280;1280;1280;1280 | 8576;8576;8576;8576;8704 |
183 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 224 7 7]] | 19.333 | 44032 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 35168.00 | 43.50 | 0.00 | 0.00 | true | 0.435321;0.435386;0.435103;0.434926;0.435106 | 0;0;0;0;0 | 0;0;0;0;0 | 35168;35296;35168;35040;35168 |
184 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 155 | 44032 | 6868992 | 44967680 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 57.33 | 53688320 | 42133.33 | 718826.67 | 12.50 | 70.55 | 936.43 | false | 0.124939;0.124938;0.124937;0.124940;0.124940 | 53688320;53688320;53688320;53688320;53688320 | 41920;41024;41280;43456;43200 | 792544;775168;770720;610592;605792 |
184 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 155 | 44032 | 6868992 | 44967680 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 1806336.00 | 1707488.00 | 45.70 | 0.00 | 0.00 | true | 0.456189;0.456157;0.458217;0.461818;0.454242 | 0;0;0;0;0 | 1806336;1806336;1811456;1806336;1806336 | 1695744;1688928;1711008;1716384;1715712 |
184 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 155 | 44032 | 6868992 | 44967680 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 7.00 | 2910208 | 42.67 | 2620800.00 | 22.00 | 1.11 | 415.74 | true | 0.219844;0.220317;0.219821;0.219891;0.219139 | 2910208;2910208;2910208;2910208;2910208 | 64;64;0;5632;0 | 2560320;2582176;2564960;2715264;2726304 |
185 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 25.667 | 44032 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 10976 | 1024.00 | 512.00 | 45.10 | 7.15 | 2.20 | true | 0.453294;0.450835;0.450249;0.448505;0.450648 | 10976;10976;10976;10976;10976 | 512;512;512;640;512 | 1024;1024;1024;1024;1024 |
187 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 7 7]] | 22.667 | 294656 | 0 | 45017344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 170922.67 | 43.90 | 0.00 | 0.00 | true | 0.438193;0.438696;0.439337;0.438970;0.438276 | 0;0;0;0;0 | 170752;171008;178816;171008;170624 | 0;13312;0;0;0 |
188 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[1 1024 7 7]] | 37.333 | 200704 | 200704 | 45218048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.00 | 50176 | 512.00 | 200789.33 | 23.10 | 0.25 | 10.04 | true | 0.230377;0.229918;0.231648;0.231177;0.231392 | 50176;50176;50176;50176;50176 | 256;512;512;512;512 | 200960;200832;200832;200704;200448 |
189 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 141.667 | 37632 | 824064 | 45255680 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.33 | 25175232 | 0.00 | 275850.67 | 3.10 | 91.26 | 410.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 25175232;25175232;25175232;25175232;25175232 | 0;0;0;0;0 | 278752;279136;277600;271200;264544 |
189 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 141.667 | 37632 | 824064 | 45255680 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 786432.00 | 532629.33 | 43.80 | 0.00 | 0.00 | true | 0.438357;0.438264;0.438586;0.434161;0.441137 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 531168;528736;532480;534240;535424 |
190 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 135.333 | 37632 | 824064 | 45293312 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.00 | 25175232 | 0.00 | 37450.67 | 3.10 | 672.22 | 412.71 | false | 0.031248;0.031249;0.031249;0.031249;0.031249 | 25175232;25175232;25175232;25175232;25175232 | 0;0;0;0;0 | 37536;37920;37408;37408;37408 |
190 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 135.333 | 37632 | 824064 | 45293312 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.67 | 0 | 786432.00 | 1237.33 | 44.30 | 0.00 | 0.00 | true | 0.437671;0.439985;0.447600;0.441472;0.450441 | 0;0;0;0;0 | 896;512;1280;1536;1536 | 786432;786432;786688;786432;786432 |
191 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 141.333 | 100608 | 1542400 | 45393920 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 62.00 | 46154592 | 0.00 | 215712.00 | 3.10 | 213.96 | 744.43 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 46154592;46154592;46154592;46154592;46154592 | 0;0;0;0;0 | 228640;226976;202560;209184;210976 |
191 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 141.333 | 100608 | 1542400 | 45393920 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1441792.00 | 564704.00 | 42.60 | 0.00 | 0.00 | true | 0.419698;0.426436;0.423186;0.430920;0.429673 | 0;0;0;0;0 | 1441792;1441792;1441792;1441792;1441792 | 545440;548320;578080;572768;573024 |
192 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 137 | 25088 | 549376 | 45124352 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.67 | 16783488 | 0.00 | 0.00 | 3.10 | 0.00 | 272.16 | true | 0.031248;0.031249;0.031249;0.031249;0.031248 | 16783488;16783488;16783488;16783488;16783488 | 0;0;256;0;0 | 10496;0;0;0;0 |
192 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 137 | 25088 | 549376 | 45124352 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.67 | 0 | 524288.00 | 308053.33 | 43.20 | 0.00 | 0.00 | true | 0.431434;0.437892;0.433747;0.431149;0.431439 | 0;0;0;0;0 | 305792;308864;305920;310144;309376 | 524288;524288;524288;524288;524288 |
193 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 29 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 9408 | 1024.00 | 128.00 | 43.70 | 8.17 | 1.88 | true | 0.436474;0.436985;0.437078;0.435229;0.441189 | 9408;9408;9408;9408;9408 | 1024;1024;1024;1024;1024 | 128;128;128;0;384 |
194 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 20.667 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 9408 | 768.00 | 426.67 | 42.00 | 7.87 | 1.88 | true | 0.419647;0.418701;0.418627;0.420525;0.423043 | 9408;9408;9408;9408;9408 | 5632;768;768;768;768 | 384;256;384;512;512 |
195 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 352 7 7]] | 21.333 | 100608 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 17248 | 1408.00 | 1546.67 | 43.50 | 5.84 | 3.45 | true | 0.434633;0.435896;0.434553;0.435426;0.434517 | 17248;17248;17248;17248;17248 | 3456;1408;1408;1408;1408 | 1664;1440;1664;1536;1408 |
196 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 7 7]] | 20.333 | 25088 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 6272 | 512.00 | 256.00 | 41.60 | 8.17 | 1.34 | true | 0.417028;0.413694;0.415339;0.416420;0.416149 | 6272;6272;6272;6272;6272 | 128;0;512;1152;128 | 512;512;512;1024;512 |
197 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 7 7]] | 19.667 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 43.10 | 0.00 | 0.00 | true | 0.431469;0.430664;0.431198;0.431204;0.431067 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
198 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 7 7]] | 20.333 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 42.80 | 0.00 | 0.00 | true | 0.429055;0.428679;0.428333;0.428480;0.427755 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;128;0 |
199 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 151 | 44032 | 5894144 | 44967680 | GPU_0_bfc | 5850112 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 46061568 | 1237.33 | 451072.00 | 12.50 | 101.84 | 921.23 | false | 0.124932;0.124932;0.124931;0.124934;0.124931 | 46061568;46061568;46061568;46061568;46061568 | 640;896;2176;3328;256 | 439936;460544;451584;449152;452480 |
199 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 151 | 44032 | 5894144 | 44967680 | GPU_0_bfc | 5850112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1548288.00 | 381493.33 | 44.20 | 0.00 | 0.00 | true | 0.441756;0.444909;0.445117;0.437849;0.440231 | 0;0;0;0;0 | 376704;387712;394688;374944;380064 | 1548288;1548288;1548288;1548288;1548288 |
199 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 151 | 44032 | 5894144 | 44967680 | GPU_0_bfc | 5850112 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2494464 | 64.00 | 1375349.33 | 19.90 | 1.81 | 415.74 | true | 0.198393;0.199066;0.198553;0.198417;0.200180 | 2494464;2494464;2494464;2494464;2494464 | 5440;64;64;64;64 | 1376032;1400640;1360896;1358176;1389120 |
200 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 158.667 | 62720 | 8419840 | 44992768 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.33 | 65802240 | 37856.00 | 744138.67 | 12.50 | 84.15 | 1307.34 | false | 0.124929;0.124931;0.124933;0.124928;0.124932 | 65802240;65802240;65802240;65802240;65802240 | 37856;37856;37856;37856;37856 | 868064;826016;688064;696416;709984 |
200 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 158.667 | 62720 | 8419840 | 44992768 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 17.00 | 0 | 2211840.00 | 2117696.00 | 46.80 | 0.00 | 0.00 | true | 0.465424;0.470282;0.465419;0.469987;0.468683 | 0;0;0;0;0 | 2096608;2106624;2130784;2136896;2115680 | 2211840;2211840;2211840;2211840;2211840 |
200 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 158.667 | 62720 | 8419840 | 44992768 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 8.00 | 3563520 | 14741.33 | 3267200.00 | 25.00 | 1.09 | 445.44 | true | 0.249898;0.251767;0.249059;0.247387;0.252424 | 3563520;3563520;3563520;3563520;3563520 | 12544;11904;15616;20096;16064 | 3158112;3180256;3327360;3326048;3295296 |
201 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 27.333 | 44032 | 0 | 44955136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.67 | 10976 | 10261.33 | 544.00 | 45.10 | 1.02 | 1.94 | true | 0.451887;0.451236;0.452471;0.450703;0.449594 | 10976;10976;10976;10976;10976 | 544;544;544;544;672 | 13632;11712;6080;8896;10176 |
202 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 7 7]] | 21 | 62720 | 0 | 44955136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 15680 | 1280.00 | 2048.00 | 45.20 | 4.71 | 3.14 | true | 0.452018;0.451063;0.452232;0.453141;0.452077 | 15680;15680;15680;15680;15680 | 1280;1280;1280;1280;1280 | 2048;2048;2048;2176;1920 |
203 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 224 7 7]] | 20 | 44032 | 0 | 44955136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 41728.00 | 43.50 | 0.00 | 0.00 | true | 0.434981;0.434937;0.434370;0.434213;0.434044 | 0;0;0;0;0 | 41728;41728;41856;41728;41728 | 0;0;0;0;0 |
204 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 155 | 75264 | 6900224 | 45030400 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 57.00 | 53688320 | 42560.00 | 644480.00 | 12.50 | 78.14 | 941.90 | false | 0.124941;0.124941;0.124941;0.124940;0.124942 | 53688320;53688320;53688320;53688320;53688320 | 778176;707392;599296;614048;612000 | 41408;40000;43072;43200;43840 |
204 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 155 | 75264 | 6900224 | 45030400 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.33 | 0 | 1807018.67 | 1711840.00 | 45.70 | 0.00 | 0.00 | true | 0.455344;0.459921;0.455605;0.459290;0.457205 | 0;0;0;0;0 | 1806336;1812992;1808384;1806336;1806336 | 1690080;1705120;1712928;1726592;1717472 |
204 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 155 | 75264 | 6900224 | 45030400 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.67 | 2910208 | 0.00 | 2673472.00 | 21.90 | 1.09 | 436.51 | true | 0.218798;0.219027;0.219050;0.218362;0.218679 | 2910208;2910208;2910208;2910208;2910208 | 2582336;2593344;2733440;2708288;2718784 | 0;0;64;0;0 |
205 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 28.333 | 75264 | 0 | 44986368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 10976 | 1024.00 | 512.00 | 45.00 | 7.15 | 2.20 | true | 0.449373;0.450959;0.450322;0.449964;0.450057 | 10976;10976;10976;10976;10976 | 1024;1024;1024;1024;1024 | 512;512;640;512;512 |
207 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 7 7]] | 22.667 | 200704 | 0 | 44923392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 177792.00 | 43.90 | 0.00 | 0.00 | true | 0.438537;0.438615;0.438423;0.439069;0.438279 | 0;0;0;0;0 | 179456;177280;179648;176640;175616 | 0;0;0;0;0 |
208 | InceptionV2/Logits/AvgPool_1a_7x7/AvgPool | AvgPool | [[1 1024 1 1]] | 42.667 | 4096 | 4096 | 44927488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 72074 | 256.00 | 384.00 | 10.70 | 112.62 | 9.01 | false | 0.106768;0.106802;0.106834;0.106803;0.106831 | 72074;72074;72074;72074;72074 | 256;256;256;256;256 | 384;384;384;384;512 |
209 | InceptionV2/Logits/Conv2d_1c_1x1/convolution | Conv2D | [[1 1001 1 1]] | 168.667 | 4096 | 4104192 | 44730880 | GPU_0_bfc | 4100096 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 67109865 | 1340672.00 | 457792.00 | 3.10 | 37.32 | 1001.64 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 67109865;67109865;67109865;67109865;67109865 | 1299840;1283456;1364608;1359872;1362304 | 451328;442112;458688;467328;463360 |
209 | InceptionV2/Logits/Conv2d_1c_1x1/convolution | Conv2D | [[1 1001 1 1]] | 168.667 | 4096 | 4104192 | 44730880 | GPU_0_bfc | 4100096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 28.00 | 0 | 4157269.33 | 3774538.67 | 46.20 | 0.00 | 0.00 | true | 0.464139;0.459190;0.465736;0.460299;0.461314 | 0;0;0;0;0 | 4158272;4159232;4154688;4153920;4158848 | 3756576;3759136;3783072;3785568;3781408 |
210 | InceptionV2/Logits/Conv2d_1c_1x1/BiasAdd | BiasAdd | [[1 1001 1 1]] | 26.667 | 4096 | 0 | 44726784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 4.00 | 1001 | 5024.00 | 341.33 | 45.60 | 0.19 | 0.25 | true | 0.473804;0.446467;0.431998;0.458310;0.462550 | 1001;1001;1001;1001;1001 | 5024;5024;5024;5024;5024 | 384;384;256;256;384 |
214 | InceptionV2/Predictions/Softmax | Softmax | [[1 1001]] | 62.667 | 4096 | 8192 | 44726784 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 8.00 | 10431 | 3072.00 | 213.33 | 2.30 | 3.18 | 1.30 | true | 0.023218;0.023083;0.023359;0.023370;0.023360 | 10431;10431;10431;10431;10431 | 256;256;256;128;128 | 2048;3072;3072;3072;3072 |
214 | InceptionV2/Predictions/Softmax | Softmax | [[1 1001]] | 62.667 | 4096 | 8192 | 44726784 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 4.00 | 0 | 1792.00 | 0.00 | 4.50 | 0.00 | 0.00 | true | 0.045205;0.043199;0.045460;0.043097;0.046474 | 0;0;0;0;0 | 2304;1792;1792;1792;1792 | 0;0;0;0;0 |
214 | InceptionV2/Predictions/Softmax | Softmax | [[1 1001]] | 62.667 | 4096 | 8192 | 44726784 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 4.00 | 24024 | 512.00 | 0.00 | 6.20 | 46.92 | 6.01 | false | 0.062140;0.062145;0.062135;0.062146;0.062145 | 24024;24024;24024;24024;24024 | 512;512;512;512;512 | 0;128;0;0;0 |
Showing 1 to 309 of 309 entries