GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise-0-TransposeNHWCToNCHW-LayoutOptimizer | Transpose | [[1 3 224 224]] | 89.333 | 602112 | 602112 | 45926912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 6.00 | 0 | 3157.33 | 468170.67 | 59.00 | 0.00 | 0.00 | true | 0.594552;0.584173;0.590609;0.589807;0.589070 | 0;0;0;0;0 | 6912;3072;3072;3072;3328 | 451552;470400;496544;462944;471168 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 304.667 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 45058048 | 256.00 | 388871.11 | 12.00 | 115.79 | 3218.43 | false | 0.120699;0.119447;0.119860;0.122753;0.120823;0.119340;0.119929;0.119371;0.120934;0.120555;0.120444;0.120260;0.120553;0.119699;0.119135 | 45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048 | 256;256;256;512;768;256;512;256;256;256;256;256;256;256;256 | 433920;381088;367808;434976;384544;366656;434144;382816;367328;432992;378848;367136;431072;373216;367456 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 304.667 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 45058048 | 256.00 | 388871.11 | 12.00 | 115.79 | 3218.43 | false | 0.120699;0.119447;0.119860;0.122753;0.120823;0.119340;0.119929;0.119371;0.120934;0.120555;0.120444;0.120260;0.120553;0.119699;0.119135 | 45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048 | 256;256;256;512;768;256;512;256;256;256;256;256;256;256;256 | 433920;381088;367808;434976;384544;366656;434144;382816;367328;432992;378848;367136;431072;373216;367456 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 304.667 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 45058048 | 256.00 | 388871.11 | 12.00 | 115.79 | 3218.43 | false | 0.120699;0.119447;0.119860;0.122753;0.120823;0.119340;0.119929;0.119371;0.120934;0.120555;0.120444;0.120260;0.120553;0.119699;0.119135 | 45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048;45058048 | 256;256;256;512;768;256;512;256;256;256;256;256;256;256;256 | 433920;381088;367808;434976;384544;366656;434144;382816;367328;432992;378848;367136;431072;373216;367456 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 304.667 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 5.00 | 0 | 1109.33 | 368789.33 | 47.20 | 0.00 | 0.00 | true | 0.471365;0.472362;0.472451;0.473118;0.471295 | 0;0;0;0;0 | 768;768;1280;1280;1920 | 389056;375136;350048;372832;358400 |
2 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise | DepthwiseConv2dNative | [[1 24 112 112]] | 304.667 | 1204224 | 1816832 | 46529024 | GPU_0_bfc | 612608 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 4736.00 | 128.00 | 36.20 | 0.00 | 0.00 | true | 0.374927;0.362002;0.335709;0.362096;0.362386 | 0;0;0;0;0 | 4736;4736;4736;4736;4736 | 0;128;256;2560;0 |
3 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d | Conv2D | [[1 64 112 112]] | 143.333 | 3211264 | 3292928 | 49138176 | GPU_0_bfc | 81664 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 17.33 | 40140800 | 11008.00 | 2279082.67 | 7.80 | 17.53 | 2315.86 | true | 0.077843;0.077736;0.077804;0.077689;0.077835 | 40140800;40140800;40140800;40140800;40140800 | 2272992;2268896;2284320;2279936;2291200 | 7424;10752;11008;11520;11264 |
3 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d | Conv2D | [[1 64 112 112]] | 143.333 | 3211264 | 3292928 | 49138176 | GPU_0_bfc | 81664 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 6314.67 | 42.67 | 36.40 | 0.00 | 0.00 | true | 0.363705;0.363660;0.363767;0.363722;0.363473 | 0;0;0;0;0 | 0;128;128;0;0 | 6912;6144;6144;6144;6656 |
3 | InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d | Conv2D | [[1 64 112 112]] | 143.333 | 3211264 | 3292928 | 49138176 | GPU_0_bfc | 81664 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 938.67 | 8778.67 | 7.30 | 0.00 | 0.00 | true | 0.073316;0.073350;0.073340;0.073338;0.073419 | 0;0;0;0;0 | 1024;1408;1024;768;512 | 7328;11808;4000;9888;9120 |
4 | InceptionV2/InceptionV2/Conv2d_1a_7x7/BiasAdd | BiasAdd | [[1 64 112 112]] | 34 | 3211264 | 0 | 47933952 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 8.33 | 802816 | 384.00 | 1475008.00 | 47.40 | 0.54 | 96.34 | true | 0.474464;0.472888;0.473830;0.473544;0.476089 | 802816;802816;802816;802816;802816 | 384;384;384;384;384 | 1418432;1450208;1494176;1502880;1480640 |
5 | InceptionV2/InceptionV2/Conv2d_1a_7x7/Relu | Relu | [[1 64 112 112]] | 23.667 | 3211264 | 0 | 47933952 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 256.00 | 1017237.33 | 75.20 | 0.00 | 0.00 | true | 0.743120;0.758827;0.753105;0.750785;0.753339 | 0;0;0;0;0 | 0;256;256;256;256 | 1031040;986592;1082304;1029568;991104 |
6 | InceptionV2/InceptionV2/MaxPool_2a_3x3/MaxPool | MaxPool | [[1 64 56 56]] | 49.333 | 802816 | 802816 | 48736768 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 200704 | 4842.67 | 953333.33 | 53.70 | 0.21 | 22.30 | true | 0.539094;0.541114;0.531547;0.543280;0.530400 | 200704;200704;200704;200704;200704 | 4928;4672;4928;4416;5952 | 952896;942528;955200;951904;996032 |
7 | InceptionV2/InceptionV2/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 56 56]] | 97 | 802816 | 819200 | 46328320 | GPU_0_bfc | 16384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 13.00 | 25890816 | 394.67 | 823189.33 | 7.00 | 31.44 | 1991.60 | false | 0.069980;0.068912;0.069863;0.069443;0.069499 | 25890816;25890816;25890816;25890816;25890816 | 823520;823584;823040;820384;823008 | 480;224;480;224;480 |
7 | InceptionV2/InceptionV2/Conv2d_2b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 56 56]] | 97 | 802816 | 819200 | 46328320 | GPU_0_bfc | 16384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 17152.00 | 426.67 | 40.80 | 0.00 | 0.00 | true | 0.408228;0.408167;0.408267;0.408229;0.408459 | 0;0;0;0;0 | 17152;17152;17152;17152;17152 | 384;384;512;384;512 |
8 | InceptionV2/InceptionV2/Conv2d_2b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 56 56]] | 34.667 | 802816 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 200704 | 3584.00 | 426.67 | 65.10 | 50.04 | 50.18 | false | 0.661265;0.671886;0.645807;0.644788;0.625997 | 200704;200704;200704;200704;200704 | 3072;3840;3584;3584;3584 | 384;512;512;384;384 |
9 | InceptionV2/InceptionV2/Conv2d_2b_1x1/Relu | Relu | [[1 64 56 56]] | 19 | 802816 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 256.00 | 67.50 | 0.00 | 0.00 | true | 0.676019;0.675572;0.674240;0.674957;0.675647 | 0;0;0;0;0 | 0;0;0;0;0 | 384;256;256;256;256 |
10 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 56 56]] | 173.333 | 2408448 | 4440832 | 47933952 | GPU_0_bfc | 2032384 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 57.00 | 373309440 | 10666.67 | 2737866.67 | 21.30 | 135.82 | 6549.29 | false | 0.213548;0.210181;0.210837;0.216510;0.213747 | 373309440;373309440;373309440;373309440;373309440 | 2756896;2702624;2675008;2754080;2784032 | 12800;11520;9984;10496;9984 |
10 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 56 56]] | 173.333 | 2408448 | 4440832 | 47933952 | GPU_0_bfc | 2032384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 442368.00 | 2229.33 | 45.90 | 0.00 | 0.00 | true | 0.459416;0.455421;0.460054;0.458973;0.459699 | 0;0;0;0;0 | 442368;442368;442368;442368;442368 | 3104;864;2208;2400;2080 |
10 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 56 56]] | 173.333 | 2408448 | 4440832 | 47933952 | GPU_0_bfc | 2032384 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 712704 | 1621.33 | 170901.33 | 7.50 | 4.13 | 178.18 | true | 0.075076;0.075089;0.075110;0.075155;0.074889 | 712704;712704;712704;712704;712704 | 1536;1792;2048;1536;1536 | 150464;168992;176672;171456;172256 |
11 | InceptionV2/InceptionV2/Conv2d_2c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 56 56]] | 26.667 | 2408448 | 0 | 47131136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 602112 | 768.00 | 118656.00 | 73.20 | 5.04 | 100.35 | true | 0.728479;0.738488;0.726412;0.728770;0.742151 | 602112;602112;602112;602112;602112 | 768;768;768;768;768 | 109856;120864;91680;125248;125920 |
12 | InceptionV2/InceptionV2/Conv2d_2c_3x3/Relu | Relu | [[1 192 56 56]] | 19.667 | 2408448 | 0 | 47131136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 0.00 | 22368.00 | 64.80 | 0.00 | 0.00 | true | 0.648048;0.647616;0.646389;0.648490;0.647643 | 0;0;0;0;0 | 256;0;0;0;0 | 20224;24480;17280;28800;22400 |
13 | InceptionV2/InceptionV2/MaxPool_3a_3x3/MaxPool | MaxPool | [[1 192 28 28]] | 37.667 | 602112 | 602112 | 47733248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 150528 | 0.00 | 601920.00 | 52.40 | 0.25 | 21.50 | true | 0.522174;0.522767;0.529632;0.525045;0.523367 | 150528;150528;150528;150528;150528 | 0;0;0;0;0 | 601920;601920;601952;601920;601824 |
14 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 192 28 28]] | 38 | 602112 | 602112 | 45926912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 3956397 | 4096.00 | 602944.00 | 53.60 | 6.52 | 439.60 | true | 0.532221;0.540188;0.532339;0.534794;0.539609 | 3956397;3956397;3956397;3956397;3956397 | 602944;602944;602944;602848;602944 | 4608;3584;4352;3584;4352 |
15 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98 | 200704 | 249856 | 46127616 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 19710976 | 341.33 | 251072.00 | 3.10 | 78.40 | 1159.47 | false | 0.031244;0.031244;0.031244;0.031244;0.031244 | 19710976;19710976;19710976;19710976;19710976 | 256;512;512;256;256 | 251168;250944;251072;251072;251072 |
15 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98 | 200704 | 249856 | 46127616 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 49152.00 | 682.67 | 41.30 | 0.00 | 0.00 | true | 0.414791;0.411869;0.412016;0.412028;0.419436 | 0;0;0;0;0 | 49152;49152;49152;49152;49152 | 768;768;640;640;640 |
16 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 97.333 | 200704 | 249856 | 46328320 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 19710976 | 0.00 | 198112.00 | 3.10 | 99.49 | 1159.47 | false | 0.031244;0.031244;0.031244;0.031244;0.031244 | 19710976;19710976;19710976;19710976;19710976 | 0;0;0;0;14336 | 192096;197600;198624;198112;199136 |
16 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 97.333 | 200704 | 249856 | 46328320 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 49152.00 | 0.00 | 41.40 | 0.00 | 0.00 | true | 0.414294;0.414613;0.414451;0.414513;0.414056 | 0;0;0;0;0 | 49152;49152;49152;49152;49152 | 0;0;0;128;0 |
17 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 95.667 | 200704 | 249856 | 46529024 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 19710976 | 0.00 | 192672.00 | 3.10 | 102.30 | 1159.47 | false | 0.031244;0.031244;0.031244;0.031244;0.031244 | 19710976;19710976;19710976;19710976;19710976 | 0;0;0;0;0 | 171296;192800;193184;192672;192544 |
17 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 95.667 | 200704 | 249856 | 46529024 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 49152.00 | 42.67 | 41.50 | 0.00 | 0.00 | true | 0.414465;0.414153;0.415020;0.414793;0.414739 | 0;0;0;0;0 | 49152;49152;50688;49152;49152 | 128;0;0;128;0 |
18 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 32 28 28]] | 94.333 | 100352 | 124928 | 46027264 | GPU_0_bfc | 24576 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 16.33 | 9855488 | 0.00 | 672.00 | 3.10 | 14665.90 | 603.41 | false | 0.031245;0.031244;0.031244;0.031244;0.031244 | 9855488;9855488;9855488;9855488;9855488 | 1536;0;0;0;0 | 544;672;672;672;672 |
18 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 32 28 28]] | 94.333 | 100352 | 124928 | 46027264 | GPU_0_bfc | 24576 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 24576.00 | 0.00 | 41.70 | 0.00 | 0.00 | true | 0.416711;0.416768;0.416720;0.420251;0.416788 | 0;0;0;0;0 | 24576;24576;24576;24576;24576 | 0;0;0;128;0 |
19 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 25.667 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 128.00 | 45.00 | 130.67 | 12.54 | false | 0.447835;0.449221;0.450544;0.451012;0.450107 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 256;128;128;128;128 |
20 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.30 | 196.00 | 12.54 | false | 0.444790;0.443109;0.442581;0.446067;0.442132 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;0;0;0;128 |
21 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 19 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.20 | 196.00 | 12.54 | false | 0.442582;0.441916;0.443034;0.442902;0.441876 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;0;0;384;0 |
22 | InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 32 28 28]] | 19.667 | 100352 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 128.00 | 42.67 | 43.50 | 147.00 | 6.27 | false | 0.435146;0.434373;0.434181;0.434570;0.436331 | 25088;25088;25088;25088;25088 | 0;128;128;0;0 | 128;128;128;128;128 |
23 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 18.667 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438387;0.438215;0.438599;0.438663;0.438785 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
24 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 17.667 | 200704 | 0 | 45425152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.435226;0.435288;0.435844;0.435558;0.435160 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
25 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 116.667 | 301056 | 1137152 | 45726208 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 22.00 | 53329920 | 0.00 | 334901.33 | 12.50 | 159.24 | 2424.09 | false | 0.124669;0.124678;0.124671;0.124676;0.124678 | 53329920;53329920;53329920;53329920;53329920 | 326144;333248;336640;335456;336000 | 0;0;0;0;0 |
25 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 116.667 | 301056 | 1137152 | 45726208 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 45.00 | 0.00 | 0.00 | true | 0.449492;0.449390;0.450835;0.451232;0.449174 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
25 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 116.667 | 301056 | 1137152 | 45726208 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 1578.67 | 6.20 | 225.73 | 89.09 | false | 0.062283;0.062289;0.062290;0.062280;0.062285 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 1408;3840;1664;1536;1536 |
26 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 112.667 | 200704 | 1003520 | 45726208 | GPU_0_bfc | 802816 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 22.00 | 35553280 | 0.00 | 42.67 | 12.50 | 833273.49 | 1616.06 | false | 0.124658;0.124669;0.124659;0.124667;0.124660 | 35553280;35553280;35553280;35553280;35553280 | 0;0;10496;0;0 | 0;128;128;0;0 |
26 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 112.667 | 200704 | 1003520 | 45726208 | GPU_0_bfc | 802816 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 147456.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.433294;0.434690;0.436369;0.435578;0.435380 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 0;0;0;0;128 |
26 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 112.667 | 200704 | 1003520 | 45726208 | GPU_0_bfc | 802816 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 237568 | 0.00 | 0.00 | 6.20 | 0.00 | 79.19 | true | 0.062296;0.062294;0.062298;0.062295;0.062294 | 237568;237568;237568;237568;237568 | 0;0;0;0;0 | 0;0;0;128;0 |
27 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 25 | 301056 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 0.00 | 45.00 | 196.00 | 18.82 | false | 0.450135;0.452846;0.449640;0.450758;0.449669 | 75264;75264;75264;75264;75264 | 128;0;0;0;0 | 384;384;384;384;384 |
28 | InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 19.333 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.30 | 196.00 | 12.54 | false | 0.443728;0.441426;0.443632;0.442554;0.443232 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;0;0;0;128 |
29 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 28 28]] | 18 | 301056 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.00 | 0.00 | 0.00 | true | 0.440091;0.439921;0.440325;0.440238;0.440462 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;128;0 |
30 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 121.333 | 301056 | 1554944 | 45826560 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 30.00 | 79478784 | 0.00 | 79616.00 | 12.50 | 998.28 | 2649.29 | false | 0.124760;0.124764;0.124763;0.124754;0.124763 | 79478784;79478784;79478784;79478784;79478784 | 0;0;0;0;0 | 81280;78464;80128;80000;78720 |
30 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 121.333 | 301056 | 1554944 | 45826560 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 331776.00 | 53.33 | 44.40 | 0.00 | 0.00 | true | 0.445893;0.442940;0.441081;0.443696;0.446512 | 0;0;0;0;0 | 331776;331776;331776;331776;331776 | 32;128;128;0;0 |
30 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 121.333 | 301056 | 1554944 | 45826560 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 115914.67 | 6.20 | 4.61 | 133.63 | true | 0.062242;0.062251;0.062247;0.062240;0.062246 | 534528;534528;534528;534528;534528 | 0;0;768;0;0 | 113952;117920;116128;115232;116384 |
31 | InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 25.333 | 301056 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 0.00 | 45.10 | 196.00 | 18.82 | false | 0.450894;0.449984;0.450921;0.450891;0.451380 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 0;0;0;128;0 |
33 | InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 256 28 28]] | 22.333 | 802816 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 168586.67 | 68.00 | 0.00 | 0.00 | true | 0.681515;0.679491;0.678763;0.680622;0.681220 | 0;0;0;0;0 | 0;0;0;0;0 | 168608;168416;169376;167904;168736 |
34 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 256 28 28]] | 42 | 802816 | 802816 | 46328320 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 4827031 | 0.00 | 43680.00 | 52.60 | 110.51 | 402.25 | false | 0.524827;0.526125;0.526834;0.525791;0.526898 | 4827031;4827031;4827031;4827031;4827031 | 0;0;0;0;0 | 43936;43680;43680;43680;43680 |
35 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 101.667 | 200704 | 266240 | 46529024 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 20.00 | 26264576 | 0.00 | 320.00 | 3.10 | 82076.80 | 1313.23 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 0;0;0;0;0 | 320;320;320;448;320 |
35 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 101.667 | 200704 | 266240 | 46529024 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 65536.00 | 42.67 | 41.20 | 0.00 | 0.00 | true | 0.413860;0.412121;0.412372;0.412407;0.412382 | 0;0;0;0;0 | 65536;65536;65536;65536;66048 | 128;0;0;0;128 |
36 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 99 | 200704 | 266240 | 46729728 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 20.00 | 26264576 | 0.00 | 2165.33 | 3.10 | 12129.58 | 1313.23 | false | 0.031246;0.031245;0.031245;0.031246;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 0;0;5376;0;0 | 2208;2080;2080;2208;2208 |
36 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 99 | 200704 | 266240 | 46729728 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 65536.00 | 42.67 | 41.40 | 0.00 | 0.00 | true | 0.414003;0.414382;0.415067;0.413483;0.419663 | 0;0;0;0;0 | 65536;65536;65536;65536;65536 | 0;128;128;0;0 |
37 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98.667 | 200704 | 266240 | 46930432 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 20.00 | 26264576 | 85.33 | 56768.00 | 3.10 | 461.97 | 1313.23 | false | 0.031245;0.031245;0.031245;0.031245;0.031246 | 26264576;26264576;26264576;26264576;26264576 | 0;0;256;2560;0 | 56768;56768;56768;56896;56768 |
37 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98.667 | 200704 | 266240 | 46930432 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 65536.00 | 0.00 | 41.30 | 0.00 | 0.00 | true | 0.412727;0.413266;0.417291;0.411787;0.412193 | 0;0;0;0;0 | 0;0;0;0;128 | 65536;65536;65536;65536;68864 |
38 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98.667 | 200704 | 266240 | 46328320 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 20.00 | 26264576 | 0.00 | 48330.67 | 3.10 | 543.43 | 1313.23 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 26264576;26264576;26264576;26264576;26264576 | 48288;48416;48416;48288;48288 | 768;0;0;0;0 |
38 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 98.667 | 200704 | 266240 | 46328320 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 65536.00 | 42.67 | 41.70 | 0.00 | 0.00 | true | 0.422231;0.413645;0.419846;0.413361;0.416555 | 0;0;0;0;0 | 65536;65536;65536;65536;65536 | 0;128;128;0;0 |
39 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 26 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 128.00 | 45.00 | 130.67 | 12.54 | false | 0.448977;0.452455;0.451792;0.449408;0.449192 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 128;128;128;128;256 |
40 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 85.33 | 44.30 | 147.00 | 12.54 | false | 0.443426;0.440984;0.441530;0.442830;0.443189 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 128;0;0;256;128 |
41 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 20 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 170.67 | 44.30 | 117.60 | 12.54 | false | 0.442999;0.442588;0.443714;0.444142;0.442959 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 128;256;256;128;128 |
42 | InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 19.667 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 47018.67 | 44.40 | 1.06 | 12.54 | true | 0.443895;0.444231;0.442274;0.444230;0.443104 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 46976;47232;47232;46848;46848 |
43 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 18.333 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438722;0.438583;0.438560;0.438430;0.438670 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
44 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 17.667 | 200704 | 0 | 45525504 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.434443;0.434444;0.434976;0.435158;0.435050 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;128;0 |
45 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 115.667 | 301056 | 1137152 | 45826560 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 22.00 | 53329920 | 0.00 | 106432.00 | 12.50 | 501.07 | 2424.09 | false | 0.124673;0.124676;0.124668;0.124659;0.124659 | 53329920;53329920;53329920;53329920;53329920 | 0;0;0;0;0 | 106176;106560;106560;106304;106432 |
45 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 115.667 | 301056 | 1137152 | 45826560 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 42.67 | 44.60 | 0.00 | 0.00 | true | 0.446187;0.446879;0.445582;0.445410;0.445365 | 0;0;0;0;0 | 221184;221184;221184;221184;221440 | 0;128;128;0;0 |
45 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 115.667 | 301056 | 1137152 | 45826560 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 85.33 | 6.20 | 4176.02 | 89.09 | false | 0.062294;0.062303;0.062293;0.062274;0.062297 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 256;0;0;128;128 |
46 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 112.333 | 301056 | 1317376 | 45926912 | GPU_0_bfc | 1016320 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 22.00 | 53329920 | 0.00 | 0.00 | 12.50 | 0.00 | 2424.09 | true | 0.124682;0.124666;0.124671;0.124664;0.124673 | 53329920;53329920;53329920;53329920;53329920 | 128;0;0;0;0 | 0;0;0;0;0 |
46 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 112.333 | 301056 | 1317376 | 45926912 | GPU_0_bfc | 1016320 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 45.00 | 0.00 | 0.00 | true | 0.451332;0.443940;0.452688;0.452357;0.444933 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
46 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 112.333 | 301056 | 1317376 | 45926912 | GPU_0_bfc | 1016320 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 356352 | 0.00 | 42.67 | 6.20 | 8351.93 | 97.18 | false | 0.062278;0.062284;0.062284;0.062288;0.062286 | 356352;356352;356352;356352;356352 | 0;128;128;0;0 | 0;0;0;0;0 |
47 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 24.667 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 0.00 | 45.00 | 196.00 | 18.82 | false | 0.449835;0.450730;0.450587;0.450364;0.449868 | 75264;75264;75264;75264;75264 | 384;384;384;640;384 | 0;0;0;0;128 |
48 | InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 20 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 0.00 | 44.60 | 196.00 | 18.82 | false | 0.445850;0.446370;0.445930;0.445947;0.444383 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 0;0;0;128;0 |
49 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 28 28]] | 18.667 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 44.00 | 0.00 | 0.00 | true | 0.439807;0.439910;0.439835;0.439446;0.440095 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;128;0;0 |
50 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 119.667 | 301056 | 1554944 | 46027264 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 30.00 | 79478784 | 85.33 | 76832.00 | 12.50 | 1033.30 | 2649.29 | false | 0.124763;0.124756;0.124751;0.124753;0.124757 | 79478784;79478784;79478784;79478784;79478784 | 0;0;256;0;512 | 76704;77984;76960;76576;76832 |
50 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 119.667 | 301056 | 1554944 | 46027264 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 331776.00 | 0.00 | 44.80 | 0.00 | 0.00 | true | 0.450764;0.444559;0.444448;0.449552;0.449798 | 0;0;0;0;0 | 128;0;0;0;0 | 331776;331776;331776;331776;331776 |
50 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 119.667 | 301056 | 1554944 | 46027264 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 29941.33 | 6.20 | 17.85 | 133.63 | true | 0.062234;0.062236;0.062249;0.062245;0.062243 | 534528;534528;534528;534528;534528 | 29728;28704;29856;30496;30240 | 0;0;0;0;0 |
51 | InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 23.667 | 301056 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 42.67 | 45.10 | 176.40 | 18.82 | false | 0.450797;0.450060;0.451132;0.451765;0.450405 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 0;128;128;0;0 |
53 | InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 320 28 28]] | 20.667 | 1003520 | 0 | 45726208 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 70570.67 | 65.00 | 0.00 | 0.00 | true | 0.652252;0.650880;0.649794;0.650487;0.649741 | 0;0;0;0;0 | 0;0;0;0;0 | 70400;69504;70656;71040;70656 |
54 | InceptionV2/InceptionV2/Mixed_4a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 320 14 14]] | 35.333 | 250880 | 250880 | 45977088 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.00 | 62720 | 0.00 | 448.00 | 29.20 | 140.00 | 12.54 | false | 0.292101;0.292586;0.292188;0.292194;0.292627 | 62720;62720;62720;62720;62720 | 256;0;0;0;0 | 512;384;416;416;640 |
55 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 103.333 | 200704 | 282624 | 46177792 | GPU_0_bfc | 81920 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 24.00 | 32818176 | 0.00 | 362.67 | 3.10 | 90491.21 | 1367.42 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 32818176;32818176;32818176;32818176;32818176 | 0;0;0;9984;0 | 320;448;448;320;320 |
55 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 28 28]] | 103.333 | 200704 | 282624 | 46177792 | GPU_0_bfc | 81920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 81920.00 | 0.00 | 42.00 | 0.00 | 0.00 | true | 0.425286;0.419683;0.419604;0.421369;0.420307 | 0;0;0;0;0 | 81920;81920;81920;81920;81920 | 0;0;0;128;0 |
56 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 28 28]] | 114 | 401408 | 565248 | 46579200 | GPU_0_bfc | 163840 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 35.67 | 65636352 | 0.00 | 2250.67 | 3.90 | 29163.07 | 1840.25 | false | 0.039281;0.039186;0.039322;0.039133;0.039315 | 65636352;65636352;65636352;65636352;65636352 | 0;0;0;0;0 | 2336;2208;2208;2080;2336 |
56 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 28 28]] | 114 | 401408 | 565248 | 46579200 | GPU_0_bfc | 163840 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 163840.00 | 0.00 | 41.90 | 0.00 | 0.00 | true | 0.419378;0.418669;0.421219;0.418664;0.418591 | 0;0;0;0;0 | 164352;163840;163840;163840;163840 | 128;0;0;0;0 |
57 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 28 28]] | 25.333 | 200704 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 256.00 | 0.00 | 44.90 | 196.00 | 12.54 | false | 0.448515;0.447420;0.448652;0.448563;0.449200 | 50176;50176;50176;50176;50176 | 256;256;256;256;256 | 0;0;0;128;0 |
58 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 28 28]] | 19 | 401408 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 100352 | 512.00 | 42.67 | 50.30 | 180.92 | 25.09 | false | 0.501698;0.502962;0.503403;0.503355;0.502264 | 100352;100352;100352;100352;100352 | 512;512;512;512;512 | 0;128;128;0;0 |
59 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 28 28]] | 17.333 | 200704 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438632;0.438718;0.438421;0.438523;0.439560 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
60 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 128 28 28]] | 17.333 | 401408 | 0 | 45575680 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 53.00 | 0.00 | 0.00 | true | 0.529826;0.529367;0.530363;0.529924;0.528877 | 0;0;0;0;0 | 0;256;0;0;0 | 0;0;0;0;128 |
61 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 113.333 | 301056 | 1137152 | 45876736 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 22.00 | 53329920 | 0.00 | 140725.33 | 12.50 | 378.96 | 2424.09 | false | 0.124655;0.124670;0.124670;0.124674;0.124676 | 53329920;53329920;53329920;53329920;53329920 | 0;0;0;0;0 | 140992;142016;140704;139840;140480 |
61 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 113.333 | 301056 | 1137152 | 45876736 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 45.20 | 0.00 | 0.00 | true | 0.453436;0.452693;0.452265;0.451828;0.451536 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
61 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 28 28]] | 113.333 | 301056 | 1137152 | 45876736 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 597.33 | 6.20 | 596.57 | 89.09 | false | 0.062309;0.062309;0.062296;0.062292;0.062302 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 512;640;640;512;640 |
62 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 166.667 | 200704 | 1368576 | 45876736 | GPU_0_bfc | 1167872 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 72.33 | 82606720 | 0.00 | 704.00 | 3.10 | 117339.09 | 1142.03 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 82606720;82606720;82606720;82606720;82606720 | 0;0;0;0;0 | 576;832;704;832;576 |
62 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 166.667 | 200704 | 1368576 | 45876736 | GPU_0_bfc | 1167872 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 737280.00 | 55594.67 | 44.90 | 0.00 | 0.00 | true | 0.455710;0.448891;0.447460;0.450463;0.447197 | 0;0;0;0;0 | 737280;737280;737280;737280;737280 | 55808;55552;55680;54912;55552 |
62 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 166.667 | 200704 | 1368576 | 45876736 | GPU_0_bfc | 1167872 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 5.00 | 0 | 3072.00 | 298.67 | 47.60 | 0.00 | 0.00 | true | 0.475371;0.475955;0.476267;0.475746;0.476046 | 0;0;0;0;0 | 3072;3072;3584;3072;3072 | 256;256;384;256;384 |
63 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 28 28]] | 25 | 301056 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 75264 | 384.00 | 384.00 | 45.00 | 98.00 | 18.82 | false | 0.450784;0.448805;0.449861;0.451350;0.450080 | 75264;75264;75264;75264;75264 | 384;384;384;384;384 | 512;384;384;384;384 |
64 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 20 | 200704 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 31360 | 640.00 | 0.00 | 44.70 | 49.00 | 7.84 | false | 0.446498;0.446583;0.445455;0.446849;0.447163 | 31360;31360;31360;31360;31360 | 0;0;0;0;128 | 640;640;6016;640;640 |
65 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 96 28 28]] | 17.667 | 301056 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 43.90 | 0.00 | 0.00 | true | 0.438963;0.438964;0.438645;0.438738;0.438825 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;256;128;0 |
66 | InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 160 14 14]] | 16.667 | 200704 | 0 | 45475328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 43.60 | 0.00 | 0.00 | true | 0.436142;0.435897;0.436247;0.435828;0.435851 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;1664;0;0 |
67 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 145 | 75264 | 733184 | 45550592 | GPU_0_bfc | 657920 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 56.00 | 37177728 | 0.00 | 32.00 | 3.10 | 1161804.00 | 663.89 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 37177728;37177728;37177728;37177728;37177728 | 0;0;0;0;0 | 32;32;32;160;32 |
67 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 145 | 75264 | 733184 | 45550592 | GPU_0_bfc | 657920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 331776.00 | 93920.00 | 44.80 | 0.00 | 0.00 | true | 0.449263;0.444483;0.449573;0.444823;0.448792 | 0;0;0;0;0 | 331776;331776;331776;331776;331776 | 93184;93440;88192;95488;95136 |
67 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 145 | 75264 | 733184 | 45550592 | GPU_0_bfc | 657920 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 4.00 | 0 | 0.00 | 1322.67 | 47.70 | 0.00 | 0.00 | true | 0.476733;0.476639;0.476665;0.476710;0.476862 | 0;0;0;0;0 | 0;0;0;0;0 | 2176;1536;4864;256;256 |
68 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 24.333 | 75264 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 554.67 | 85.33 | 44.50 | 29.40 | 4.70 | false | 0.445241;0.445820;0.445219;0.446333;0.444281 | 18816;18816;18816;18816;18816 | 896;384;384;384;896 | 128;128;128;0;0 |
69 | InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 96 14 14]] | 18.667 | 75264 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.436796;0.436939;0.437265;0.437061;0.437120 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
71 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 39 | 526848 | 526848 | 45701120 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 2893038 | 768.00 | 105845.33 | 43.10 | 27.14 | 361.63 | true | 0.429782;0.431800;0.432528;0.429865;0.431872 | 2893038;2893038;2893038;2893038;2893038 | 256;768;768;768;768 | 105888;105760;105760;105888;105888 |
72 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 24791424 | 0.00 | 117.33 | 3.10 | 211291.15 | 635.68 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 32;160;160;160;32 | 0;0;0;0;1536 |
72 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 8192.00 | 44.40 | 0.00 | 0.00 | true | 0.443767;0.444270;0.444387;0.438173;0.445343 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 7808;8064;8320;8320;8192 |
73 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 14 14]] | 114 | 50176 | 197632 | 45826560 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 16527616 | 0.00 | 42.67 | 3.10 | 387362.97 | 423.79 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 16527616;16527616;16527616;16527616;16527616 | 128;128;0;0;0 | 0;256;0;0;0 |
73 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 14 14]] | 114 | 50176 | 197632 | 45826560 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 147456.00 | 82816.00 | 43.30 | 0.00 | 0.00 | true | 0.432550;0.432631;0.433391;0.434575;0.429690 | 0;0;0;0;0 | 83584;81280;81152;83584;83584 | 147456;147456;147456;149504;147456 |
74 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 14 14]] | 116 | 175616 | 691712 | 46002176 | GPU_0_bfc | 516096 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 57846656 | 0.00 | 42.67 | 3.10 | 1355770.41 | 1483.25 | false | 0.031246;0.031246;0.031245;0.031246;0.031245 | 57846656;57846656;57846656;57846656;57846656 | 0;0;0;0;0 | 0;128;0;128;0 |
74 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 14 14]] | 116 | 175616 | 691712 | 46002176 | GPU_0_bfc | 516096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 516096.00 | 2538.67 | 45.60 | 0.00 | 0.00 | true | 0.455095;0.455476;0.456189;0.456465;0.455757 | 0;0;0;0;0 | 516096;516096;516096;516096;516096 | 2496;2720;2368;2560;2560 |
75 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 114 | 100352 | 451584 | 45650944 | GPU_0_bfc | 351232 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 33055232 | 85.33 | 42.67 | 3.10 | 258244.00 | 847.57 | false | 0.031248;0.031248;0.031248;0.031247;0.031248 | 33055232;33055232;33055232;33055232;33055232 | 256;0;0;8192;0 | 128;128;0;0;0 |
75 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 114 | 100352 | 451584 | 45650944 | GPU_0_bfc | 351232 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 0.00 | 45.20 | 0.00 | 0.00 | true | 0.453394;0.451935;0.451849;0.449043;0.451047 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 0;128;0;0;0 |
76 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 25.667 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 0.00 | 44.60 | 49.00 | 4.70 | false | 0.445996;0.445277;0.445920;0.445888;0.445531 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;0;0;0;128 |
77 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 14 14]] | 18.667 | 50176 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 12544 | 256.00 | 0.00 | 41.90 | 49.00 | 3.14 | false | 0.420036;0.418916;0.417460;0.420405;0.419207 | 12544;12544;12544;12544;12544 | 256;256;256;256;256 | 0;0;0;128;0 |
78 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 14 14]] | 19 | 175616 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 43904 | 896.00 | 0.00 | 45.00 | 49.00 | 10.98 | false | 0.451699;0.450012;0.450087;0.450618;0.450416 | 43904;43904;43904;43904;43904 | 896;896;896;896;896 | 0;32;0;0;0 |
79 | InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19 | 100352 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.60 | 49.00 | 6.27 | false | 0.436278;0.434861;0.435311;0.436030;0.436755 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 128;0;0;0;0 |
80 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 96 14 14]] | 17.667 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.435936;0.436310;0.436180;0.436191;0.435910 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
81 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 14 14]] | 17.333 | 50176 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.10 | 0.00 | 0.00 | true | 0.430636;0.430578;0.430175;0.430293;0.430740 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;128;0 |
82 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126.667 | 100352 | 1856512 | 45224448 | GPU_0_bfc | 1756160 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 29.00 | 26492928 | 0.00 | 64.00 | 12.50 | 413952.00 | 913.55 | false | 0.124791;0.124807;0.124797;0.124796;0.124800 | 26492928;26492928;26492928;26492928;26492928 | 0;0;0;0;0 | 64;64;64;64;192 |
82 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126.667 | 100352 | 1856512 | 45224448 | GPU_0_bfc | 1756160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 442368.00 | 2645.33 | 45.10 | 0.00 | 0.00 | true | 0.452143;0.452507;0.449173;0.449581;0.451182 | 0;0;0;0;0 | 442368;442368;442368;442368;442368 | 2688;2560;2688;2560;2688 |
82 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126.667 | 100352 | 1856512 | 45224448 | GPU_0_bfc | 1756160 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 712704 | 0.00 | 938.67 | 7.50 | 759.27 | 178.18 | false | 0.074897;0.074826;0.075055;0.074789;0.074881 | 712704;712704;712704;712704;712704 | 0;0;0;0;0 | 1024;896;896;896;1024 |
83 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 112.667 | 75264 | 941056 | 45224448 | GPU_0_bfc | 865792 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 13332480 | 0.00 | 0.00 | 12.50 | 0.00 | 634.88 | true | 0.124729;0.124724;0.124713;0.124728;0.124717 | 13332480;13332480;13332480;13332480;13332480 | 0;0;0;0;0 | 128;0;0;0;0 |
83 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 112.667 | 75264 | 941056 | 45224448 | GPU_0_bfc | 865792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 6314.67 | 44.60 | 0.00 | 0.00 | true | 0.449814;0.450421;0.444229;0.444226;0.444417 | 0;0;0;0;0 | 5888;6272;6528;6400;6272 | 221184;221184;221184;221184;221184 |
83 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 112.667 | 75264 | 941056 | 45224448 | GPU_0_bfc | 865792 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 356352 | 0.00 | 0.00 | 6.20 | 0.00 | 97.18 | true | 0.062239;0.062240;0.062247;0.062255;0.062250 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 0;0;512;0;0 |
84 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 24.333 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.90 | 49.00 | 6.27 | false | 0.437592;0.438853;0.440100;0.439330;0.439351 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;0;128 |
85 | InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 19.333 | 75264 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 896.00 | 42.67 | 44.30 | 20.05 | 4.70 | true | 0.442633;0.443326;0.442978;0.442816;0.442464 | 18816;18816;18816;18816;18816 | 0;0;0;128;128 | 7552;384;384;1920;384 |
86 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 128 14 14]] | 17.667 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.434514;0.434885;0.435627;0.435160;0.435395 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
87 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 127.667 | 100352 | 2329088 | 45274624 | GPU_0_bfc | 2228736 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 37.00 | 35209216 | 0.00 | 201440.00 | 12.50 | 174.79 | 951.60 | false | 0.124841;0.124852;0.124832;0.124849;0.124849 | 35209216;35209216;35209216;35209216;35209216 | 0;0;0;0;0 | 203136;190752;204256;203520;197664 |
87 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 127.667 | 100352 | 2329088 | 45274624 | GPU_0_bfc | 2228736 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 589824.00 | 59690.67 | 43.80 | 0.00 | 0.00 | true | 0.439367;0.436807;0.437500;0.438133;0.448659 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 60160;59648;58816;60032;59392 |
87 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 127.667 | 100352 | 2329088 | 45274624 | GPU_0_bfc | 2228736 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 950272 | 0.00 | 43114.67 | 9.70 | 22.04 | 237.57 | true | 0.097286;0.097204;0.097263;0.097142;0.096927 | 950272;950272;950272;950272;950272 | 37504;60256;36736;43264;48576 | 0;512;0;0;0 |
88 | InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 24 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 1280.00 | 42.67 | 44.00 | 18.97 | 6.27 | true | 0.440182;0.440024;0.439603;0.439368;0.439606 | 25088;25088;25088;25088;25088 | 0;128;128;0;0 | 512;512;7680;512;2816 |
90 | InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 21.333 | 526848 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 27690.67 | 58.70 | 0.00 | 0.00 | true | 0.587874;0.586525;0.577947;0.586865;0.586216 | 0;0;0;0;0 | 0;0;0;0;0 | 28416;25984;27776;28672;26880 |
91 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 38.667 | 451584 | 451584 | 45701120 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 2514363 | 0.00 | 161877.33 | 42.20 | 15.53 | 279.37 | true | 0.421228;0.420434;0.423730;0.421273;0.422427 | 2514363;2514363;2514363;2514363;2514363 | 0;5632;0;0;0 | 162848;150144;163488;162848;159936 |
92 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 115.667 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 24791424 | 170.67 | 104778.67 | 3.10 | 236.22 | 635.68 | false | 0.031248;0.031247;0.031247;0.031247;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 1792;0;256;0;256 | 106208;101280;108128;106528;101600 |
92 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 115.667 | 75264 | 296448 | 45776384 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 74.67 | 44.50 | 0.00 | 0.00 | true | 0.443269;0.443086;0.447214;0.444884;0.447893 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;96;256;128 |
93 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 114.667 | 75264 | 296448 | 45851648 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 24791424 | 0.00 | 38517.33 | 3.10 | 643.64 | 635.68 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 40352;35104;38560;44448;36640 |
93 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 114.667 | 75264 | 296448 | 45851648 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221440.00 | 6058.67 | 44.60 | 0.00 | 0.00 | true | 0.443058;0.447248;0.448065;0.442550;0.448451 | 0;0;0;0;0 | 221184;226560;221952;221184;221184 | 4096;8192;5504;4480;8320 |
94 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 116.333 | 150528 | 592896 | 46002176 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 49582848 | 0.00 | 38304.00 | 3.10 | 1294.46 | 1271.36 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 49582848;49582848;49582848;49582848;49582848 | 0;0;0;0;0 | 35872;40992;43936;34464;38048 |
94 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 116.333 | 150528 | 592896 | 46002176 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 442368.00 | 92416.00 | 45.60 | 0.00 | 0.00 | true | 0.456106;0.453104;0.457131;0.460050;0.455912 | 0;0;0;0;0 | 98720;89408;88896;97184;90656 | 442368;450048;442368;442368;442368 |
95 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 114.333 | 100352 | 526848 | 45575680 | GPU_0_bfc | 426496 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 33055232 | 0.00 | 288.00 | 3.10 | 114775.11 | 847.57 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;0;11264;0;0 | 288;288;288;288;416 |
95 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 114.333 | 100352 | 526848 | 45575680 | GPU_0_bfc | 426496 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 4096.00 | 44.20 | 0.00 | 0.00 | true | 0.439556;0.444128;0.441829;0.441314;0.442043 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 4224;1792;4096;4096;4096 |
96 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 25.667 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 128.00 | 44.50 | 36.75 | 4.70 | false | 0.444650;0.443791;0.445413;0.445021;0.445799 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 128;128;128;256;128 |
97 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 20 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 42.67 | 44.30 | 44.10 | 4.70 | false | 0.438976;0.442132;0.442899;0.443304;0.442991 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;128;128;0;0 |
98 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 19.667 | 150528 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 0.00 | 44.20 | 49.00 | 9.41 | false | 0.441740;0.442868;0.441565;0.441824;0.443037 | 37632;37632;37632;37632;37632 | 768;768;768;768;768 | 128;0;0;0;0 |
99 | InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 18.333 | 100352 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.40 | 49.00 | 6.27 | false | 0.436471;0.434777;0.434002;0.434407;0.434069 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;0;128 |
100 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 96 14 14]] | 18.333 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.435873;0.435911;0.436087;0.436205;0.436721 | 0;0;0;0;0 | 0;0;0;128;0 | 0;0;0;0;1536 |
101 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 96 14 14]] | 17 | 75264 | 0 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 43.50 | 0.00 | 0.00 | true | 0.434845;0.434653;0.434702;0.434765;0.434581 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;128;0;0 |
102 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 122 | 100352 | 2107392 | 45224448 | GPU_0_bfc | 2007040 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 29.00 | 26492928 | 0.00 | 51424.00 | 12.50 | 515.19 | 913.55 | false | 0.124818;0.124798;0.124792;0.124797;0.124803 | 26492928;26492928;26492928;26492928;26492928 | 0;0;0;0;0 | 53760;51136;52064;50400;51072 |
102 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 122 | 100352 | 2107392 | 45224448 | GPU_0_bfc | 2007040 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 442368.00 | 28672.00 | 45.10 | 0.00 | 0.00 | true | 0.449971;0.451412;0.452009;0.451559;0.451485 | 0;0;0;0;0 | 442368;442368;442368;442368;442368 | 28672;25088;28800;28544;29056 |
102 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 122 | 100352 | 2107392 | 45224448 | GPU_0_bfc | 2007040 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 712704 | 0.00 | 118698.67 | 7.50 | 6.00 | 178.18 | true | 0.074999;0.074794;0.075005;0.074977;0.075125 | 712704;712704;712704;712704;712704 | 0;0;0;0;0 | 116864;117248;120608;119936;118912 |
103 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 120 | 100352 | 2082304 | 45249536 | GPU_0_bfc | 1981952 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 29.00 | 26492928 | 0.00 | 0.00 | 12.50 | 0.00 | 913.55 | true | 0.124800;0.124797;0.124798;0.124782;0.124802 | 26492928;26492928;26492928;26492928;26492928 | 0;0;0;0;0 | 0;0;0;0;128 |
103 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 120 | 100352 | 2082304 | 45249536 | GPU_0_bfc | 1981952 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 442368.00 | 42.67 | 44.90 | 0.00 | 0.00 | true | 0.449853;0.449011;0.450662;0.449425;0.445193 | 0;0;0;0;0 | 442624;442368;442368;442368;442368 | 0;128;128;0;0 |
103 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 120 | 100352 | 2082304 | 45249536 | GPU_0_bfc | 1981952 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 712704 | 0.00 | 0.00 | 7.50 | 0.00 | 178.18 | true | 0.075430;0.075424;0.075613;0.075487;0.075581 | 712704;712704;712704;712704;712704 | 0;0;0;0;512 | 128;0;0;0;0 |
104 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 24 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.90 | 49.00 | 6.27 | false | 0.439403;0.438991;0.439233;0.439034;0.439493 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;128;0 |
105 | InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 42.67 | 43.50 | 45.23 | 6.27 | false | 0.435932;0.435991;0.431382;0.435008;0.435092 | 25088;25088;25088;25088;25088 | 0;128;128;0;0 | 512;512;512;512;512 |
106 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 128 14 14]] | 18 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.436554;0.436284;0.434733;0.436235;0.435561 | 0;0;0;0;0 | 128;0;0;0;0 | 0;0;0;0;0 |
107 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126.333 | 100352 | 2466816 | 45274624 | GPU_0_bfc | 2366464 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 37.00 | 35209216 | 0.00 | 180298.67 | 12.50 | 195.28 | 951.60 | false | 0.124851;0.124841;0.124848;0.124840;0.124854 | 35209216;35209216;35209216;35209216;35209216 | 0;0;0;0;0 | 152736;179872;181152;179872;182304 |
107 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126.333 | 100352 | 2466816 | 45274624 | GPU_0_bfc | 2366464 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 590592.00 | 426.67 | 44.00 | 0.00 | 0.00 | true | 0.435549;0.443055;0.438981;0.444489;0.436622 | 0;0;0;0;0 | 589824;589824;589824;596224;592128 | 384;384;512;384;640 |
107 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 126.333 | 100352 | 2466816 | 45274624 | GPU_0_bfc | 2366464 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 950272 | 0.00 | 49493.33 | 9.70 | 19.20 | 237.57 | true | 0.097487;0.097287;0.097368;0.097344;0.097480 | 950272;950272;950272;950272;950272 | 0;5632;0;0;0 | 76672;49280;48512;50688;47744 |
108 | InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 24 | 100352 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.90 | 49.00 | 6.27 | false | 0.439132;0.439486;0.439549;0.439191;0.440426 | 25088;25088;25088;25088;25088 | 128;0;0;0;0 | 1280;512;512;512;512 |
110 | InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 21 | 727552 | 0 | 45450240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 58.50 | 0.00 | 0.00 | true | 0.576124;0.584106;0.584786;0.585277;0.584662 | 0;0;0;0;0 | 0;0;0;0;128 | 0;0;1536;0;0 |
111 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 38 | 451584 | 451584 | 45901824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 2461563 | 0.00 | 1792.00 | 42.40 | 1373.64 | 273.51 | false | 0.423610;0.423818;0.428090;0.423360;0.423245 | 2461563;2461563;2461563;2461563;2461563 | 0;0;0;0;0 | 1792;1792;1792;2048;1792 |
112 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 109.667 | 100352 | 401408 | 46002176 | GPU_0_bfc | 301056 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 33055232 | 0.00 | 970.67 | 3.10 | 34054.14 | 847.57 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;0;0;0;0 | 1056;928;928;928;1056 |
112 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 109.667 | 100352 | 401408 | 46002176 | GPU_0_bfc | 301056 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 426.67 | 44.70 | 0.00 | 0.00 | true | 0.446014;0.446661;0.447385;0.438031;0.448349 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 384;640;512;384;384 |
113 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 117.667 | 100352 | 395264 | 46102528 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 33055232 | 0.00 | 202.67 | 3.10 | 163101.21 | 847.57 | false | 0.031247;0.031248;0.031248;0.031247;0.031248 | 33055232;33055232;33055232;33055232;33055232 | 0;0;512;0;0 | 160;160;14368;288;160 |
113 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 117.667 | 100352 | 395264 | 46102528 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 85504.00 | 45.00 | 0.00 | 0.00 | true | 0.451736;0.442594;0.451769;0.450141;0.447442 | 0;0;0;0;0 | 85888;85760;85376;85376;85120 | 294912;294912;294912;294912;294912 |
114 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 116.667 | 200704 | 569344 | 46303232 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 41319040 | 0.00 | 74.67 | 3.10 | 553377.53 | 1059.46 | false | 0.031248;0.031247;0.031247;0.031247;0.031248 | 41319040;41319040;41319040;41319040;41319040 | 0;0;0;0;0 | 32;32;32;2848;160 |
114 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 116.667 | 200704 | 569344 | 46303232 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 368640.00 | 100778.67 | 46.20 | 0.00 | 0.00 | true | 0.457499;0.461591;0.462781;0.462323;0.461038 | 0;0;0;0;0 | 368640;368640;368640;368896;368640 | 100608;100608;86144;104704;101120 |
115 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 115.333 | 75264 | 296448 | 45650944 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 24791424 | 0.00 | 32.00 | 3.10 | 774732.00 | 635.68 | false | 0.031247;0.031247;0.031248;0.031248;0.031248 | 24791424;24791424;24791424;24791424;24791424 | 32;32;32;160;32 | 0;0;0;0;0 |
115 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 115.333 | 75264 | 296448 | 45650944 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 101376.00 | 44.60 | 0.00 | 0.00 | true | 0.446511;0.447432;0.446271;0.446307;0.443342 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 102656;102656;98816;96256;102656 |
116 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 25.333 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 44.00 | 49.00 | 6.27 | false | 0.439647;0.440171;0.435966;0.439534;0.439623 | 25088;25088;25088;25088;25088 | 512;512;2048;512;512 | 0;0;128;0;0 |
117 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19.333 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.50 | 49.00 | 6.27 | false | 0.434559;0.434816;0.433389;0.434500;0.435029 | 25088;25088;25088;25088;25088 | 512;512;512;512;1792 | 0;0;0;0;0 |
118 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 19.333 | 200704 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 31360 | 640.00 | 0.00 | 44.70 | 49.00 | 7.84 | false | 0.447967;0.447276;0.447338;0.447724;0.447281 | 31360;31360;31360;31360;31360 | 0;0;0;0;0 | 640;640;640;640;640 |
119 | InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 18.667 | 75264 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 0.00 | 44.20 | 49.00 | 4.70 | false | 0.442970;0.441593;0.442311;0.441929;0.442793 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;0;0;128;0 |
120 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 17.333 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.437076;0.437354;0.436973;0.437315;0.436976 | 0;0;0;0;0 | 7424;0;0;0;0 | 0;0;128;0;0 |
121 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 16.667 | 100352 | 0 | 45199360 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.20 | 0.00 | 0.00 | true | 0.432269;0.432202;0.432185;0.431675;0.432292 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
122 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 131 | 125440 | 2911488 | 45324800 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 37.00 | 44011520 | 0.00 | 50293.33 | 12.50 | 875.10 | 1189.50 | false | 0.124846;0.124842;0.124844;0.124851;0.124846 | 44011520;44011520;44011520;44011520;44011520 | 0;0;0;0;0 | 49248;52000;49376;49504;53024 |
122 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 131 | 125440 | 2911488 | 45324800 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 737280.00 | 60960.00 | 44.90 | 0.00 | 0.00 | true | 0.449004;0.441417;0.449082;0.449642;0.456424 | 0;0;0;0;0 | 737280;737280;737280;737280;744832 | 60960;61088;59808;60960;60960 |
122 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 131 | 125440 | 2911488 | 45324800 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 1187840 | 0.00 | 71040.00 | 11.20 | 16.72 | 296.96 | true | 0.111505;0.111469;0.111587;0.111671;0.111840 | 1187840;1187840;1187840;1187840;1187840 | 71488;68672;71936;71840;69792 | 0;0;0;0;0 |
123 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 130.333 | 125440 | 2911488 | 45349888 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 37.00 | 44011520 | 170.67 | 25322.67 | 12.50 | 1726.39 | 1189.50 | false | 0.124832;0.124841;0.124825;0.124841;0.124850 | 44011520;44011520;44011520;44011520;44011520 | 0;0;0;512;512 | 26944;24896;22848;28096;24128 |
123 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 130.333 | 125440 | 2911488 | 45349888 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 737280.00 | 2474.67 | 44.60 | 0.00 | 0.00 | true | 0.444184;0.451384;0.444995;0.449521;0.444247 | 0;0;0;0;0 | 737280;737280;737280;737280;742656 | 2944;2176;2688;2560;1536 |
123 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 130.333 | 125440 | 2911488 | 45349888 | GPU_0_bfc | 2786048 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 1187840 | 0.00 | 78112.00 | 11.00 | 15.21 | 296.96 | true | 0.110309;0.110205;0.110315;0.110233;0.110550 | 1187840;1187840;1187840;1187840;1187840 | 0;0;0;0;0 | 78112;77984;82208;77344;78240 |
124 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 24.333 | 125440 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 31360 | 640.00 | 298.67 | 45.10 | 33.41 | 7.84 | false | 0.451207;0.451060;0.449561;0.450829;0.450603 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 384;256;512;256;256 |
125 | InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 19.667 | 125440 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 31360 | 640.00 | 0.00 | 44.70 | 49.00 | 7.84 | false | 0.446022;0.447065;0.445010;0.448371;0.446929 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 128;0;0;0;0 |
126 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 160 14 14]] | 18 | 125440 | 0 | 45249536 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 1450.67 | 43.70 | 0.00 | 0.00 | true | 0.436865;0.437156;0.437238;0.436932;0.437178 | 0;0;0;0;0 | 1536;1536;1536;1280;1280 | 0;0;0;0;0 |
127 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 136 | 200704 | 3683072 | 45450240 | GPU_0_bfc | 3482368 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 45.00 | 54906880 | 21.33 | 348320.00 | 12.50 | 157.62 | 1220.15 | false | 0.124876;0.124878;0.124874;0.124868;0.124879 | 54906880;54906880;54906880;54906880;54906880 | 0;0;64;64;0 | 350048;328768;346944;348512;349504 |
127 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 136 | 200704 | 3683072 | 45450240 | GPU_0_bfc | 3482368 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 921600.00 | 23104.00 | 44.50 | 0.00 | 0.00 | true | 0.443019;0.443507;0.447960;0.452146;0.443079 | 0;0;0;0;0 | 921600;921600;921600;921600;921600 | 23232;23296;23232;22784;22848 |
127 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 136 | 200704 | 3683072 | 45450240 | GPU_0_bfc | 3482368 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 1484800 | 42.67 | 138933.33 | 13.50 | 10.68 | 342.67 | true | 0.135768;0.135486;0.135043;0.135433;0.134790 | 1484800;1484800;1484800;1484800;1484800 | 138016;153376;140160;138016;138624 | 64;5440;0;0;64 |
128 | InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 24 | 200704 | 0 | 45324800 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 31360 | 640.00 | 725.33 | 45.10 | 22.97 | 7.84 | true | 0.451028;0.451200;0.450611;0.452486;0.451196 | 31360;31360;31360;31360;31360 | 640;640;768;768;1024 | 640;640;640;640;640 |
130 | InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 20.667 | 451584 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 24693.33 | 58.50 | 0.00 | 0.00 | true | 0.584507;0.586858;0.584322;0.585805;0.584195 | 0;0;0;0;0 | 0;0;0;0;0 | 24992;24352;24096;24736;24992 |
131 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 576 14 14]] | 37.333 | 727552 | 727552 | 45901824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 2417538 | 170.67 | 326090.67 | 42.50 | 7.41 | 268.62 | true | 0.425354;0.423675;0.425277;0.424296;0.426494 | 2417538;2417538;2417538;2417538;2417538 | 256;0;256;0;256 | 326112;325504;326112;326368;326048 |
132 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 119.333 | 125440 | 494080 | 46027264 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 41319040 | 170.67 | 315200.00 | 3.10 | 131.02 | 1059.46 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 41319040;41319040;41319040;41319040;41319040 | 256;0;256;0;256 | 309824;308672;324032;320704;315072 |
132 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 14 14]] | 119.333 | 125440 | 494080 | 46027264 | GPU_0_bfc | 368640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 368640.00 | 77130.67 | 45.60 | 0.00 | 0.00 | true | 0.458219;0.458295;0.459421;0.449820;0.452302 | 0;0;0;0;0 | 368640;368640;368640;368640;368640 | 79520;85152;72352;72864;79008 |
133 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 116 | 100352 | 395264 | 46127616 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 33055232 | 0.00 | 23360.00 | 3.10 | 1415.04 | 847.57 | false | 0.031247;0.031247;0.031248;0.031247;0.031247 | 33055232;33055232;33055232;33055232;33055232 | 0;0;0;0;0 | 23360;23360;23488;23360;23360 |
133 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 116 | 100352 | 395264 | 46127616 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 0.00 | 44.10 | 0.00 | 0.00 | true | 0.443036;0.435226;0.444155;0.436435;0.445304 | 0;0;0;0;0 | 0;0;0;128;0 | 294912;294912;294912;294912;294912 |
134 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116.667 | 75264 | 296448 | 46202880 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 24791424 | 0.00 | 917.33 | 3.10 | 27025.54 | 635.68 | false | 0.031248;0.031247;0.031247;0.031248;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 832;960;1088;832;960 |
134 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 116.667 | 75264 | 296448 | 46202880 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 1024.00 | 44.40 | 0.00 | 0.00 | true | 0.443717;0.443453;0.444323;0.445510;0.443702 | 0;0;0;0;0 | 1152;1024;768;1024;1024 | 221184;221184;221184;221184;221184 |
135 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 113.667 | 75264 | 451584 | 45826560 | GPU_0_bfc | 376320 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 24791424 | 0.00 | 288.00 | 3.10 | 86081.33 | 635.68 | false | 0.031248;0.031248;0.031247;0.031247;0.031247 | 24791424;24791424;24791424;24791424;24791424 | 0;0;0;0;0 | 288;288;416;288;288 |
135 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 14 14]] | 113.667 | 75264 | 451584 | 45826560 | GPU_0_bfc | 376320 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 0.00 | 44.40 | 0.00 | 0.00 | true | 0.443808;0.448196;0.444018;0.444815;0.444007 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 0;0;0;128;0 |
136 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 14 14]] | 25.667 | 125440 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 31360 | 640.00 | 170.67 | 45.10 | 38.68 | 7.84 | false | 0.450574;0.450305;0.451597;0.450595;0.450431 | 31360;31360;31360;31360;31360 | 640;640;640;640;640 | 256;256;128;128;128 |
137 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19.667 | 100352 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.50 | 49.00 | 6.27 | false | 0.435219;0.435287;0.432395;0.435551;0.434325 | 25088;25088;25088;25088;25088 | 0;0;0;0;128 | 512;512;512;512;512 |
138 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 18.667 | 75264 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 384.00 | 0.00 | 44.20 | 49.00 | 4.70 | false | 0.442310;0.441752;0.442511;0.441886;0.441753 | 18816;18816;18816;18816;18816 | 384;384;384;384;384 | 0;0;0;128;0 |
139 | InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 14 14]] | 19.667 | 75264 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 18816 | 1066.67 | 0.00 | 44.30 | 17.64 | 4.70 | true | 0.439378;0.442912;0.442180;0.443484;0.442911 | 18816;18816;18816;18816;18816 | 0;0;128;0;0 | 9600;2432;384;384;384 |
140 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 160 14 14]] | 18 | 125440 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 43.80 | 0.00 | 0.00 | true | 0.438306;0.438310;0.437653;0.437746;0.438208 | 0;0;0;0;0 | 0;0;0;5376;0 | 128;128;0;0;0 |
141 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 16.667 | 100352 | 0 | 45099008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.30 | 0.00 | 0.00 | true | 0.432905;0.433074;0.432656;0.432746;0.432875 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
142 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 142 | 150528 | 4329216 | 45249536 | GPU_0_bfc | 4178688 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 45.00 | 65888256 | 256.00 | 546997.33 | 12.50 | 120.40 | 1464.18 | false | 0.124872;0.124878;0.124868;0.124879;0.124872 | 65888256;65888256;65888256;65888256;65888256 | 2048;768;0;0;0 | 549856;541184;549952;550208;540704 |
142 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 142 | 150528 | 4329216 | 45249536 | GPU_0_bfc | 4178688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.67 | 0 | 1105920.00 | 635594.67 | 45.50 | 0.00 | 0.00 | true | 0.455331;0.452883;0.455153;0.455780;0.454128 | 0;0;0;0;0 | 1105920;1105920;1105920;1105920;1105920 | 643680;633472;631584;637952;635360 |
142 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 142 | 150528 | 4329216 | 45249536 | GPU_0_bfc | 4178688 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 1781760 | 0.00 | 1238549.33 | 15.00 | 1.44 | 356.35 | true | 0.149245;0.149443;0.149823;0.149668;0.149970 | 1781760;1781760;1781760;1781760;1781760 | 0;0;0;0;0 | 1235904;1243808;1237792;1239296;1238560 |
143 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 133.667 | 150528 | 3493632 | 45274624 | GPU_0_bfc | 3343104 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 37.00 | 52813824 | 1536.00 | 251168.00 | 12.50 | 208.99 | 1427.40 | false | 0.124831;0.124839;0.124838;0.124844;0.124836 | 52813824;52813824;52813824;52813824;52813824 | 1664;1536;1536;1536;1536 | 253600;252576;247328;242080;256032 |
143 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 133.667 | 150528 | 3493632 | 45274624 | GPU_0_bfc | 3343104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 884736.00 | 422005.33 | 44.90 | 0.00 | 0.00 | true | 0.453399;0.447054;0.447300;0.447938;0.451433 | 0;0;0;0;0 | 884736;884736;884736;884736;884736 | 431232;423552;416352;416672;425792 |
143 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 133.667 | 150528 | 3493632 | 45274624 | GPU_0_bfc | 3343104 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 1425408 | 0.00 | 595274.67 | 13.10 | 2.39 | 356.35 | true | 0.131055;0.130890;0.130665;0.131333;0.131469 | 1425408;1425408;1425408;1425408;1425408 | 0;0;0;0;0 | 599200;593120;605056;593504;592960 |
144 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 25.333 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 298.67 | 44.60 | 35.28 | 9.41 | false | 0.443407;0.446120;0.445742;0.445688;0.446474 | 37632;37632;37632;37632;37632 | 768;768;768;768;768 | 384;384;256;256;256 |
145 | InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 20.333 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 9216.00 | 44.00 | 3.77 | 9.41 | true | 0.439734;0.439255;0.441039;0.440506;0.439502 | 37632;37632;37632;37632;37632 | 768;1792;768;768;768 | 8960;10240;9728;8832;8960 |
146 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 192 14 14]] | 18.333 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 11733.33 | 43.80 | 0.00 | 0.00 | true | 0.437880;0.438094;0.438340;0.436565;0.437499 | 0;0;0;0;0 | 0;0;0;0;0 | 12032;11264;11904;9984;14208 |
147 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 148 | 150528 | 5164800 | 45324800 | GPU_0_bfc | 5014272 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 52.00 | 78962688 | 0.00 | 525578.67 | 12.50 | 150.24 | 1518.51 | false | 0.124904;0.124900;0.124897;0.124897;0.124891 | 78962688;78962688;78962688;78962688;78962688 | 0;0;0;0;0 | 528800;518848;526912;526144;523680 |
147 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 148 | 150528 | 5164800 | 45324800 | GPU_0_bfc | 5014272 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1327168.00 | 68416.00 | 44.70 | 0.00 | 0.00 | true | 0.445494;0.449977;0.449317;0.446640;0.446040 | 0;0;0;0;0 | 63840;63968;75488;67584;73696 | 1327168;1327168;1327168;1327168;1327168 |
147 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 148 | 150528 | 5164800 | 45324800 | GPU_0_bfc | 5014272 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 2138112 | 0.00 | 1010869.33 | 17.40 | 2.12 | 427.62 | true | 0.174313;0.174277;0.173855;0.173885;0.173906 | 2138112;2138112;2138112;2138112;2138112 | 0;0;0;0;0 | 1014400;1012160;1011936;1006336;1008512 |
148 | InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 24.333 | 150528 | 0 | 45174272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 10496.00 | 44.60 | 3.34 | 9.41 | true | 0.444782;0.445719;0.446832;0.446444;0.446119 | 37632;37632;37632;37632;37632 | 768;2304;768;768;768 | 10496;9600;10368;10624;10752 |
150 | InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 576 14 14]] | 21.333 | 727552 | 0 | 45450240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 257642.67 | 58.50 | 0.00 | 0.00 | true | 0.584288;0.585085;0.584884;0.586177;0.586095 | 0;0;0;0;0 | 0;0;0;0;0 | 257728;257600;257504;257856;257600 |
151 | InceptionV2/InceptionV2/Mixed_5a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 576 7 7]] | 35.333 | 112896 | 112896 | 45563136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.00 | 28224 | 1024.00 | 113226.67 | 13.60 | 0.25 | 5.64 | true | 0.135744;0.135897;0.135715;0.136131;0.135847 | 28224;28224;28224;28224;28224 | 768;1024;1024;1024;1024 | 113312;113312;113184;113184;113184 |
152 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 122 | 150528 | 592896 | 45713664 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 49582848 | 0.00 | 324704.00 | 3.10 | 152.70 | 1271.36 | false | 0.031247;0.031248;0.031247;0.031248;0.031247 | 49582848;49582848;49582848;49582848;49582848 | 317696;324832;327904;325344;323936 | 0;0;0;0;0 |
152 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 14 14]] | 122 | 150528 | 592896 | 45713664 | GPU_0_bfc | 442368 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 442368.00 | 117322.67 | 45.50 | 0.00 | 0.00 | true | 0.454633;0.458970;0.453354;0.454427;0.456632 | 0;0;0;0;0 | 123776;116736;114304;117184;118048 | 442368;442368;442368;442368;442368 |
153 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 117.667 | 100352 | 395264 | 45814016 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 33055232 | 0.00 | 330.67 | 3.10 | 99965.32 | 847.57 | false | 0.031248;0.031248;0.031248;0.031247;0.031248 | 33055232;33055232;33055232;33055232;33055232 | 416;416;288;288;288 | 0;0;0;0;0 |
153 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 14 14]] | 117.667 | 100352 | 395264 | 45814016 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 294912.00 | 0.00 | 45.20 | 0.00 | 0.00 | true | 0.450813;0.454746;0.449933;0.450425;0.453625 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 0;0;128;0;0 |
154 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 14 14]] | 26.333 | 150528 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 37632 | 768.00 | 128.00 | 44.60 | 42.00 | 9.41 | false | 0.446722;0.446208;0.442810;0.446002;0.446008 | 37632;37632;37632;37632;37632 | 768;768;2816;768;768 | 128;128;128;128;256 |
155 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 14 14]] | 19 | 100352 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 25088 | 512.00 | 0.00 | 43.50 | 49.00 | 6.27 | false | 0.435633;0.434848;0.434513;0.435641;0.435530 | 25088;25088;25088;25088;25088 | 512;512;512;512;512 | 0;0;0;128;0 |
156 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 14 14]] | 18 | 150528 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.437704;0.438150;0.438303;0.438154;0.438159 | 0;0;0;0;0 | 0;0;0;0;12800 | 0;0;128;0;0 |
157 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 128 14 14]] | 17 | 100352 | 0 | 45086464 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 43.40 | 0.00 | 0.00 | true | 0.434927;0.434280;0.434417;0.434270;0.433948 | 0;0;0;0;0 | 1536;0;0;0;0 | 128;128;0;0;0 |
158 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 14 14]] | 154.667 | 200704 | 6886400 | 45287168 | GPU_0_bfc | 6685696 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 53.00 | 105283584 | 147893.33 | 745738.67 | 12.50 | 117.82 | 1986.48 | false | 0.124893;0.124895;0.124896;0.124895;0.124896 | 105283584;105283584;105283584;105283584;105283584 | 147424;146784;148960;155104;147296 | 764512;776256;730560;702560;742144 |
158 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 14 14]] | 154.667 | 200704 | 6886400 | 45287168 | GPU_0_bfc | 6685696 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 1769472.00 | 1352266.67 | 46.00 | 0.00 | 0.00 | true | 0.461201;0.461181;0.458973;0.459141;0.458466 | 0;0;0;0;0 | 1769472;1769472;1769472;1774848;1769472 | 1359488;1352864;1351648;1352288;1343840 |
158 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 14 14]] | 154.667 | 200704 | 6886400 | 45287168 | GPU_0_bfc | 6685696 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2850816 | 85.33 | 2636416.00 | 22.00 | 1.08 | 475.14 | true | 0.219914;0.220397;0.220304;0.220076;0.219618 | 2850816;2850816;2850816;2850816;2850816 | 64;128;320;64;64 | 2610752;2607264;2663840;2658880;2639616 |
159 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 166.333 | 37632 | 1037568 | 45174272 | GPU_0_bfc | 999936 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 72.00 | 28320960 | 192.00 | 139573.33 | 3.10 | 202.63 | 393.35 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 28320960;28320960;28320960;28320960;28320960 | 137568;148736;149120;132416;127648 | 192;192;1728;192;192 |
159 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 166.333 | 37632 | 1037568 | 45174272 | GPU_0_bfc | 999936 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 884736.00 | 894005.33 | 45.00 | 0.00 | 0.00 | true | 0.448998;0.446770;0.452870;0.457629;0.449084 | 0;0;0;0;0 | 884736;885504;884736;884736;884736 | 894560;887008;879552;900448;901216 |
159 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 166.333 | 37632 | 1037568 | 45174272 | GPU_0_bfc | 999936 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 4.00 | 0 | 101205.33 | 13440.00 | 46.90 | 0.00 | 0.00 | true | 0.464219;0.471580;0.471101;0.465587;0.469545 | 0;0;0;0;0 | 98304;98688;104576;100480;104448 | 13120;13344;13376;13600;15776 |
160 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 14 14]] | 25.667 | 200704 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 50176 | 1024.00 | 128.00 | 45.00 | 43.56 | 12.54 | false | 0.449221;0.451108;0.450068;0.450701;0.449820 | 50176;50176;50176;50176;50176 | 1024;1024;4096;1024;1024 | 128;128;256;128;128 |
161 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 21 | 37632 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 9408 | 768.00 | 42.67 | 41.70 | 11.61 | 2.35 | true | 0.418433;0.415659;0.415076;0.416713;0.421331 | 9408;9408;9408;9408;9408 | 768;768;768;768;768 | 128;128;0;0;0 |
162 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 256 14 14]] | 18.333 | 200704 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438832;0.439246;0.438846;0.438341;0.438216 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
163 | InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 7 7]] | 17.667 | 37632 | 0 | 45073920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 42.80 | 0.00 | 0.00 | true | 0.428481;0.427720;0.427871;0.428735;0.427942 | 0;0;0;0;0 | 0;0;0;128;0 | 0;0;0;0;0 |
164 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 7 7]] | 232 | 50176 | 2639872 | 45124096 | GPU_0_bfc | 2589696 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 139.00 | 75510016 | 0.00 | 67712.00 | 3.10 | 1115.16 | 543.24 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75510016;75510016;75510016;75510016;75510016 | 0;0;768;0;0 | 70144;66432;74112;66560;60416 |
164 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 7 7]] | 232 | 50176 | 2639872 | 45124096 | GPU_0_bfc | 2589696 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 19.00 | 0 | 2359296.00 | 1607850.67 | 45.90 | 0.00 | 0.00 | true | 0.459584;0.458743;0.458546;0.460302;0.459093 | 0;0;0;0;0 | 1604992;1609120;1601120;1609440;1614688 | 2360064;2359296;2359296;2359296;2359296 |
164 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 7 7]] | 232 | 50176 | 2639872 | 45124096 | GPU_0_bfc | 2589696 | 0 | 0 | 0 | void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 4.00 | 0 | 0.00 | 1536.00 | 47.60 | 0.00 | 0.00 | true | 0.476149;0.475852;0.475616;0.475874;0.476131 | 0;0;0;0;0 | 0;0;0;0;0 | 1536;1536;1792;1536;1536 |
165 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 7 7]] | 25.333 | 50176 | 0 | 44923392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 12544 | 1024.00 | 128.00 | 42.70 | 10.89 | 3.14 | true | 0.428557;0.425318;0.428107;0.427649;0.424953 | 12544;12544;12544;12544;12544 | 1024;1024;1024;1024;1024 | 128;128;128;256;128 |
166 | InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 256 7 7]] | 18.667 | 50176 | 0 | 44923392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.20 | 0.00 | 0.00 | true | 0.431990;0.432413;0.432686;0.431344;0.432350 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;128;0;0 |
168 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 7 7]] | 37 | 200704 | 200704 | 45124096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 1087642 | 512.00 | 61621.33 | 21.70 | 17.50 | 181.27 | true | 0.216780;0.216229;0.216898;0.216596;0.217152 | 1087642;1087642;1087642;1087642;1087642 | 512;512;512;512;512 | 60000;61536;62880;62176;61152 |
169 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 7 7]] | 140.667 | 31488 | 686848 | 45155584 | GPU_0_bfc | 655360 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 65.00 | 20979360 | 0.00 | 189056.00 | 3.10 | 110.97 | 322.76 | false | 0.031248;0.031249;0.031249;0.031249;0.031249 | 20979360;20979360;20979360;20979360;20979360 | 0;0;0;0;0 | 186336;192032;188000;189760;189408 |
169 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 7 7]] | 140.667 | 31488 | 686848 | 45155584 | GPU_0_bfc | 655360 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.33 | 0 | 655360.00 | 238570.67 | 43.60 | 0.00 | 0.00 | true | 0.439472;0.432982;0.425089;0.435328;0.444775 | 0;0;0;0;0 | 655360;655360;655616;655360;655360 | 241440;237088;238784;238752;238176 |
170 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 142 | 37632 | 824064 | 45193216 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.67 | 25175232 | 0.00 | 18282.67 | 3.10 | 1377.00 | 389.31 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 25175232;25175232;25175232;25175232;25175232 | 0;0;0;0;0 | 18112;21568;16608;17216;19520 |
170 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 142 | 37632 | 824064 | 45193216 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 786432.00 | 20384.00 | 44.00 | 0.00 | 0.00 | true | 0.432378;0.441956;0.442978;0.437919;0.441580 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 20128;18592;21536;20896;20128 |
171 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 145.333 | 69120 | 1510912 | 45262336 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 65.00 | 46154592 | 0.00 | 51434.67 | 3.10 | 897.34 | 710.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 46154592;46154592;46154592;46154592;46154592 | 0;0;0;0;0 | 49984;51008;51648;51648;56896 |
171 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 145.333 | 69120 | 1510912 | 45262336 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1441792.00 | 173504.00 | 43.00 | 0.00 | 0.00 | true | 0.430915;0.428886;0.427260;0.430094;0.430415 | 0;0;0;0;0 | 1441792;1441792;1441792;1441792;1441792 | 176544;171520;174208;174784;168096 |
172 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 140.667 | 25088 | 549376 | 45086720 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 16783488 | 0.00 | 4810.67 | 3.10 | 3488.81 | 262.24 | false | 0.031249;0.031249;0.031248;0.031249;0.031249 | 16783488;16783488;16783488;16783488;16783488 | 20512;544;544;1056;12832 | 0;0;0;0;0 |
172 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 140.667 | 25088 | 549376 | 45086720 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 524288.00 | 312661.33 | 43.00 | 0.00 | 0.00 | true | 0.424742;0.429933;0.430580;0.430872;0.434853 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 309632;308096;313728;315648;314624 |
173 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 7 7]] | 26 | 31488 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 7840 | 640.00 | 1920.00 | 45.60 | 3.06 | 1.96 | true | 0.454613;0.457739;0.455424;0.457754;0.451533 | 7840;7840;7840;7840;7840 | 640;640;640;640;1152 | 3072;5376;0;0;2688 |
174 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 19.667 | 37632 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 9408 | 768.00 | 1194.67 | 41.70 | 4.79 | 2.35 | true | 0.420025;0.414949;0.418951;0.415387;0.416235 | 9408;9408;9408;9408;9408 | 768;768;768;768;768 | 2048;768;768;768;2048 |
175 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 352 7 7]] | 20 | 69120 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 17248 | 1408.00 | 42.67 | 43.30 | 11.89 | 4.31 | true | 0.431690;0.429969;0.433383;0.432522;0.434033 | 17248;17248;17248;17248;17248 | 1408;1408;1408;1408;1408 | 1536;0;0;128;0 |
176 | InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 7 7]] | 18.333 | 25088 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 6272 | 512.00 | 341.33 | 41.60 | 7.35 | 1.57 | true | 0.415809;0.414862;0.413536;0.416277;0.419004 | 6272;6272;6272;6272;6272 | 512;512;512;9216;512 | 0;256;384;18176;384 |
177 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 160 7 7]] | 18.333 | 31488 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1450.67 | 3072.00 | 44.10 | 0.00 | 0.00 | true | 0.441348;0.440932;0.440710;0.440935;0.440994 | 0;0;0;0;0 | 0;4352;0;0;5376 | 3584;384;128;5248;6144 |
178 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 7 7]] | 17.667 | 37632 | 0 | 44886016 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 4010.67 | 42.80 | 0.00 | 0.00 | true | 0.427989;0.427735;0.428273;0.427694;0.427981 | 0;0;0;0;0 | 0;0;0;0;0 | 0;2560;9472;10752;0 |
179 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 144.333 | 44032 | 4919296 | 44930048 | GPU_0_bfc | 4875264 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 44.00 | 38434816 | 85.33 | 438432.00 | 12.50 | 87.65 | 873.52 | false | 0.124921;0.124914;0.124918;0.124920;0.124922 | 38434816;38434816;38434816;38434816;38434816 | 256;0;0;5632;0 | 439072;438688;435488;437536;439584 |
179 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 144.333 | 44032 | 4919296 | 44930048 | GPU_0_bfc | 4875264 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1290240.00 | 407136.00 | 44.80 | 0.00 | 0.00 | true | 0.447865;0.449489;0.451246;0.443110;0.447339 | 0;0;0;0;0 | 1290240;1290240;1290240;1290240;1290240 | 407584;427264;418304;395520;391552 |
179 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 144.333 | 44032 | 4919296 | 44930048 | GPU_0_bfc | 4875264 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 2078720 | 0.00 | 965354.67 | 17.30 | 2.15 | 415.74 | true | 0.172620;0.172715;0.172964;0.172699;0.172833 | 2078720;2078720;2078720;2078720;2078720 | 0;0;0;0;0 | 959392;979968;970624;964960;960480 |
180 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 156.333 | 62720 | 8419840 | 44961280 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 53.00 | 65802240 | 37856.00 | 557920.00 | 12.50 | 110.45 | 1241.55 | false | 0.124933;0.124930;0.124932;0.124932;0.124932 | 65802240;65802240;65802240;65802240;65802240 | 37856;37856;37856;37856;37856 | 545408;563968;581920;556288;553504 |
180 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 156.333 | 62720 | 8419840 | 44961280 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 17.33 | 0 | 2212245.33 | 1964896.00 | 47.20 | 0.00 | 0.00 | true | 0.470582;0.468768;0.475654;0.473609;0.472875 | 0;0;0;0;0 | 2211904;2213440;2211904;2211904;2212928 | 1969280;1956384;1966496;1958912;1972000 |
180 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 156.333 | 62720 | 8419840 | 44961280 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 7.00 | 3563520 | 12394.67 | 3300458.67 | 25.00 | 1.08 | 509.07 | true | 0.248532;0.251792;0.246569;0.252785;0.251174 | 3563520;3563520;3563520;3563520;3563520 | 13632;12608;11200;13376;9920 | 3319232;3284448;3248352;3311232;3305696 |
181 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 25.333 | 44032 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 10976 | 43840.00 | 650.67 | 45.10 | 0.25 | 2.20 | true | 0.450797;0.450645;0.450148;0.451314;0.452104 | 10976;10976;10976;10976;10976 | 43712;43968;44096;43584;43840 | 576;1088;704;576;672 |
182 | InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 7 7]] | 20 | 62720 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 15680 | 1280.00 | 9216.00 | 45.20 | 1.49 | 3.92 | true | 0.451389;0.452715;0.451581;0.451716;0.452257 | 15680;15680;15680;15680;15680 | 1280;1280;1280;3328;1280 | 9088;9216;9344;9088;9344 |
183 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 224 7 7]] | 17.667 | 44032 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 35168.00 | 43.40 | 0.00 | 0.00 | true | 0.433827;0.433830;0.434144;0.433380;0.433708 | 0;0;0;0;0 | 0;0;0;0;0 | 35296;34784;35040;35296;35168 |
184 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 159 | 44032 | 6868992 | 44967680 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 60.33 | 53688320 | 42901.33 | 593568.00 | 12.50 | 84.35 | 889.87 | false | 0.124939;0.124941;0.124939;0.124939;0.124939 | 53688320;53688320;53688320;53688320;53688320 | 573344;599072;604192;584864;596768 | 43712;43584;42048;43072;41664 |
184 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 159 | 44032 | 6868992 | 44967680 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 1806336.00 | 1698538.67 | 46.20 | 0.00 | 0.00 | true | 0.461920;0.465220;0.459972;0.462087;0.461995 | 0;0;0;0;0 | 1806336;1806336;1806336;1806336;1806336 | 1695456;1694912;1713248;1700608;1699552 |
184 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 159 | 44032 | 6868992 | 44967680 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2910208 | 0.00 | 2752512.00 | 22.00 | 1.06 | 485.03 | true | 0.219783;0.219841;0.218830;0.220950;0.219523 | 2910208;2910208;2910208;2910208;2910208 | 2773728;2748736;2719840;2761056;2747744 | 0;0;0;0;0 |
185 | InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 24 | 44032 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 10976 | 1024.00 | 640.00 | 45.00 | 6.60 | 2.74 | true | 0.450831;0.450727;0.449901;0.455148;0.449506 | 10976;10976;10976;10976;10976 | 1024;1024;1024;1280;1024 | 640;640;640;768;640 |
187 | InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 7 7]] | 21 | 294656 | 0 | 45017344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 172074.67 | 43.70 | 0.00 | 0.00 | true | 0.436938;0.436561;0.436712;0.436861;0.437202 | 0;0;0;0;0 | 7680;0;0;0;0 | 171904;172160;172288;171776;172160 |
188 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/MaxPool_0a_3x3/MaxPool | MaxPool | [[1 1024 7 7]] | 35.667 | 200704 | 200704 | 45218048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.00 | 50176 | 768.00 | 200661.33 | 22.60 | 0.25 | 10.04 | true | 0.225991;0.226172;0.225589;0.226251;0.224993 | 50176;50176;50176;50176;50176 | 256;768;768;768;768 | 200832;200320;200192;200960;200832 |
189 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 144 | 37632 | 824064 | 45255680 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.67 | 25175232 | 0.00 | 284896.00 | 3.10 | 88.37 | 389.31 | false | 0.031249;0.031249;0.031248;0.031249;0.031249 | 25175232;25175232;25175232;25175232;25175232 | 0;0;0;0;0 | 278112;288608;288352;285024;281312 |
189 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 144 | 37632 | 824064 | 45255680 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 786432.00 | 522858.67 | 43.40 | 0.00 | 0.00 | true | 0.435005;0.436824;0.435039;0.433094;0.424403 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 526912;517984;515200;523744;526848 |
190 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 134.667 | 37632 | 824064 | 45293312 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.33 | 25175232 | 512.00 | 37621.33 | 3.10 | 660.19 | 391.33 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 25175232;25175232;25175232;25175232;25175232 | 256;0;0;1280;2048 | 37664;37664;38048;37536;37536 |
190 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 7 7]] | 134.667 | 37632 | 824064 | 45293312 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 786432.00 | 938.67 | 44.00 | 0.00 | 0.00 | true | 0.439506;0.444022;0.439666;0.441150;0.428203 | 0;0;0;0;0 | 640;768;1024;1024;1536 | 786432;786432;786432;786432;786432 |
191 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 144.667 | 100608 | 1542400 | 45393920 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 65.00 | 46154592 | 0.00 | 215957.33 | 3.10 | 213.72 | 710.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 46154592;46154592;46154592;46154592;46154592 | 0;0;0;0;0 | 214592;212288;222304;220992;208064 |
191 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 352 7 7]] | 144.667 | 100608 | 1542400 | 45393920 | GPU_0_bfc | 1441792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1441792.00 | 557621.33 | 43.50 | 0.00 | 0.00 | true | 0.439906;0.433182;0.426858;0.434761;0.435913 | 0;0;0;0;0 | 558240;559840;551072;554784;563488 | 1441792;1441792;1441792;1441792;1442048 |
192 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 139.667 | 25088 | 549376 | 45124352 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 16783488 | 85.33 | 0.00 | 3.10 | 196682.27 | 262.24 | false | 0.031249;0.031248;0.031248;0.031248;0.031249 | 16783488;16783488;16783488;16783488;16783488 | 128;128;0;6656;0 | 0;0;0;11008;0 |
192 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 7 7]] | 139.667 | 25088 | 549376 | 45124352 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 524288.00 | 299349.33 | 42.70 | 0.00 | 0.00 | true | 0.427663;0.423549;0.425078;0.427038;0.433133 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 288768;307456;285824;305024;304256 |
193 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 25.667 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 9408 | 1024.00 | 725.33 | 43.30 | 5.38 | 2.35 | true | 0.432159;0.433447;0.433420;0.423293;0.432998 | 9408;9408;9408;9408;9408 | 1024;1024;1024;1280;1024 | 896;384;768;640;768 |
194 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 7 7]] | 20.333 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 9408 | 768.00 | 256.00 | 41.90 | 9.19 | 2.35 | true | 0.417750;0.418369;0.420036;0.413757;0.420284 | 9408;9408;9408;9408;9408 | 768;768;768;768;768 | 256;512;256;128;256 |
195 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 352 7 7]] | 20 | 100608 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 17248 | 1408.00 | 1376.00 | 43.40 | 6.20 | 4.31 | true | 0.433468;0.434002;0.435015;0.433647;0.437059 | 17248;17248;17248;17248;17248 | 1280;1312;1408;1408;1408 | 1408;1408;1408;1408;1408 |
196 | InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 7 7]] | 19.333 | 25088 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 6272 | 512.00 | 0.00 | 41.90 | 12.25 | 1.57 | true | 0.415356;0.419412;0.418997;0.417387;0.419880 | 6272;6272;6272;6272;6272 | 512;512;2048;512;512 | 0;0;0;0;0 |
197 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 7 7]] | 17.333 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 42.90 | 0.00 | 0.00 | true | 0.429292;0.428917;0.428900;0.429044;0.428774 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
198 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 7 7]] | 17.333 | 37632 | 0 | 44923648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 426.67 | 42.80 | 0.00 | 0.00 | true | 0.427931;0.427940;0.427856;0.427618;0.427945 | 0;0;0;0;0 | 0;768;0;0;2304 | 0;1280;0;0;1536 |
199 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 151.333 | 44032 | 5894144 | 44967680 | GPU_0_bfc | 5850112 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 52.00 | 46061568 | 938.67 | 446112.00 | 12.50 | 103.03 | 885.80 | false | 0.124933;0.124932;0.124931;0.124931;0.124931 | 46061568;46061568;46061568;46061568;46061568 | 256;512;16384;1024;1280 | 444320;445600;453152;447520;445216 |
199 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 151.333 | 44032 | 5894144 | 44967680 | GPU_0_bfc | 5850112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1548288.00 | 394464.00 | 45.00 | 0.00 | 0.00 | true | 0.450855;0.449451;0.449585;0.449728;0.449505 | 0;0;0;0;0 | 1548288;1548288;1548288;1548288;1548288 | 405632;387840;408704;378528;389920 |
199 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 151.333 | 44032 | 5894144 | 44967680 | GPU_0_bfc | 5850112 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2494464 | 64.00 | 1368128.00 | 19.70 | 1.82 | 415.74 | true | 0.196977;0.197000;0.196631;0.197379;0.196837 | 2494464;2494464;2494464;2494464;2494464 | 64;64;64;64;64 | 1369056;1367808;1359840;1376576;1367520 |
200 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 159.667 | 62720 | 8419840 | 44992768 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 53.00 | 65802240 | 37856.00 | 637920.00 | 12.50 | 97.37 | 1241.55 | false | 0.124932;0.124931;0.124930;0.124933;0.124931 | 65802240;65802240;65802240;65802240;65802240 | 37856;37856;37856;37856;37856 | 647168;643616;605024;649632;622976 |
200 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 159.667 | 62720 | 8419840 | 44992768 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 17.00 | 0 | 2211840.00 | 2112437.33 | 47.30 | 0.00 | 0.00 | true | 0.472851;0.471559;0.470905;0.474390;0.474311 | 0;0;0;0;0 | 2211840;2211840;2211840;2211840;2211840 | 2109952;2114048;2113312;2108832;2115360 |
200 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 7 7]] | 159.667 | 62720 | 8419840 | 44992768 | GPU_0_bfc | 8357120 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 8.00 | 3563520 | 11285.33 | 3404949.33 | 25.20 | 1.04 | 445.44 | true | 0.252977;0.250743;0.253178;0.251042;0.253947 | 3563520;3563520;3563520;3563520;3563520 | 13568;14656;10752;5376;9536 | 3405664;3384384;3450560;3360544;3424800 |
201 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 25.667 | 44032 | 0 | 44955136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 10976 | 8064.00 | 416.00 | 44.90 | 1.29 | 2.74 | true | 0.451781;0.449786;0.448777;0.442076;0.449933 | 10976;10976;10976;10976;10976 | 7424;11264;8192;7296;8576 | 416;416;416;416;544 |
202 | InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 7 7]] | 19.333 | 62720 | 0 | 44955136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 15680 | 1280.00 | 1877.33 | 45.20 | 4.97 | 3.92 | true | 0.452081;0.452346;0.452437;0.451483;0.450316 | 15680;15680;15680;15680;15680 | 1280;1280;1280;1280;1280 | 2048;2048;2048;1536;1408 |
203 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 224 7 7]] | 17.667 | 44032 | 0 | 44955136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42368.00 | 43.30 | 0.00 | 0.00 | true | 0.433757;0.433059;0.433651;0.432976;0.433547 | 0;0;0;0;0 | 0;0;0;0;0 | 42368;42368;42496;42368;42368 |
204 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 161.333 | 75264 | 6900224 | 45030400 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 60.00 | 53688320 | 43413.33 | 553514.67 | 12.50 | 89.94 | 894.81 | false | 0.124940;0.124940;0.124942;0.124940;0.124941 | 53688320;53688320;53688320;53688320;53688320 | 43456;42432;43584;43200;43840 | 555808;576288;556864;541344;547872 |
204 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 161.333 | 75264 | 6900224 | 45030400 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.00 | 0 | 1806336.00 | 1700362.67 | 46.30 | 0.00 | 0.00 | true | 0.462455;0.460564;0.465052;0.465428;0.462058 | 0;0;0;0;0 | 1806336;1806336;1806336;1806336;1806336 | 1705600;1706656;1695776;1698720;1696768 |
204 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 7 7]] | 161.333 | 75264 | 6900224 | 45030400 | GPU_0_bfc | 6824960 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 2910208 | 0.00 | 2794602.67 | 21.90 | 1.04 | 485.03 | true | 0.219569;0.219752;0.219193;0.218704;0.217222 | 2910208;2910208;2910208;2910208;2910208 | 0;128;0;0;0 | 2788736;2763776;2792000;2803072;2807808 |
205 | InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 7 7]] | 24.667 | 75264 | 0 | 44986368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 10976 | 1024.00 | 640.00 | 45.00 | 6.60 | 2.74 | true | 0.463105;0.451504;0.448183;0.450220;0.448886 | 10976;10976;10976;10976;10976 | 1024;1024;1024;1024;1024 | 640;640;768;640;640 |
207 | InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 7 7]] | 21.333 | 200704 | 0 | 44923392 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 176853.33 | 43.70 | 0.00 | 0.00 | true | 0.437247;0.436694;0.437002;0.437633;0.437623 | 0;0;0;0;0 | 0;0;0;0;0 | 155904;177408;177280;178944;175872 |
208 | InceptionV2/Logits/AvgPool_1a_7x7/AvgPool | AvgPool | [[1 1024 1 1]] | 37.667 | 4096 | 4096 | 44927488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 72074 | 4010.67 | 768.00 | 10.70 | 15.08 | 10.30 | true | 0.106926;0.106918;0.106847;0.107003;0.106860 | 72074;72074;72074;72074;72074 | 1792;7936;2048;2048;16896 | 896;1024;640;640;768 |
209 | InceptionV2/Logits/Conv2d_1c_1x1/convolution | Conv2D | [[1 1001 1 1]] | 174 | 4096 | 4104192 | 44730880 | GPU_0_bfc | 4100096 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.33 | 67109865 | 1299754.67 | 452746.67 | 3.10 | 38.29 | 902.83 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 67109865;67109865;67109865;67109865;67109865 | 1287424;1284224;1318016;1305472;1306368 | 451680;455520;450784;456928;451040 |
209 | InceptionV2/Logits/Conv2d_1c_1x1/convolution | Conv2D | [[1 1001 1 1]] | 174 | 4096 | 4104192 | 44730880 | GPU_0_bfc | 4100096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 29.33 | 0 | 4159104.00 | 3759733.33 | 46.60 | 0.00 | 0.00 | true | 0.468473;0.469413;0.462552;0.466814;0.462719 | 0;0;0;0;0 | 4159488;4160640;4158720;4158720;4159104 | 3754016;3753760;3768864;3757600;3767584 |
210 | InceptionV2/Logits/Conv2d_1c_1x1/BiasAdd | BiasAdd | [[1 1001 1 1]] | 26.333 | 4096 | 0 | 44726784 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 3.00 | 1001 | 4256.00 | 170.67 | 46.40 | 0.23 | 0.33 | true | 0.465426;0.477601;0.463371;0.462629;0.446969 | 1001;1001;1001;1001;1001 | 4256;4256;4256;4256;4256 | 256;256;128;128;128 |
214 | InceptionV2/Predictions/Softmax | Softmax | [[1 1001]] | 59.333 | 4096 | 8192 | 44726784 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 8.00 | 10431 | 1962.67 | 128.00 | 2.30 | 4.99 | 1.30 | true | 0.022526;0.022837;0.022472;0.022891;0.022782 | 10431;10431;10431;10431;10431 | 128;128;128;128;128 | 1280;2560;1536;1792;3072 |
214 | InceptionV2/Predictions/Softmax | Softmax | [[1 1001]] | 59.333 | 4096 | 8192 | 44726784 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 4.00 | 0 | 1450.67 | 256.00 | 4.00 | 0.00 | 0.00 | true | 0.039425;0.040707;0.039706;0.037026;0.040636 | 0;0;0;0;0 | 1536;1536;1280;3072;1280 | 128;256;256;256;256 |
214 | InceptionV2/Predictions/Softmax | Softmax | [[1 1001]] | 59.333 | 4096 | 8192 | 44726784 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 3.00 | 24024 | 2048.00 | 42.67 | 6.20 | 11.49 | 8.01 | true | 0.062198;0.062251;0.062245;0.062198;0.062250 | 24024;24024;24024;24024;24024 | 1280;2048;2048;2048;2048 | 128;0;0;0;256 |
Showing 1 to 309 of 309 entries