GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | InceptionV3/InceptionV3/Conv2d_1a_3x3/BatchNorm/batchnorm/mul-0-TransposeNHWCToNCHW-LayoutOptimizer | Transpose | [[1 3 299 299]] | 83.667 | 1072896 | 1072896 | 97428224 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 8.00 | 0 | 6741.33 | 125173.33 | 85.40 | 0.00 | 0.00 | true | 0.858394;0.866036;0.860190;0.836610;0.844883 | 0;0;0;0;0 | 6912;6912;6656;6656;6656 | 124768;125792;124960;120928;127872 |
2 | InceptionV3/InceptionV3/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 149 149]] | 193 | 2841856 | 2845440 | 99197184 | GPU_0_bfc | 3584 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.00 | 46192416 | 154026.67 | 2718965.33 | 18.80 | 16.08 | 2566.25 | true | 0.188356;0.187813;0.188153;0.187936;0.187946 | 46192416;46192416;46192416;46192416;46192416 | 159552;145472;157056;138944;173952 | 2726592;2706368;2723936;2688000;2755232 |
2 | InceptionV3/InceptionV3/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 149 149]] | 193 | 2841856 | 2845440 | 99197184 | GPU_0_bfc | 3584 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 3712.00 | 170.67 | 37.90 | 0.00 | 0.00 | true | 0.379344;0.380951;0.379360;0.379080;0.378715 | 0;0;0;0;0 | 3712;3712;3712;3712;3712 | 256;256;128;128;128 |
3 | InceptionV3/InceptionV3/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 32 149 149]] | 40 | 2841856 | 0 | 98124288 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.33 | 710432 | 768.00 | 288160.00 | 79.10 | 2.46 | 112.18 | true | 0.789819;0.790395;0.793421;0.791717;0.785028 | 710432;710432;710432;710432;710432 | 768;768;896;768;768 | 274528;300288;289664;301792;251840 |
4 | InceptionV3/InceptionV3/Conv2d_1a_3x3/Relu | Relu | [[1 32 149 149]] | 26.667 | 2841856 | 0 | 98124288 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 512.00 | 197557.33 | 69.00 | 0.00 | 0.00 | true | 0.690549;0.691157;0.690128;0.690288;0.689828 | 0;0;0;0;0 | 200224;213920;181408;197888;194560 | 512;512;512;512;512 |
5 | InceptionV3/InceptionV3/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 163 | 2766080 | 2905600 | 100890368 | GPU_0_bfc | 139520 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 28.00 | 215183360 | 361557.33 | 3347146.67 | 21.80 | 58.02 | 7685.12 | false | 0.212803;0.219969;0.214910;0.218913;0.226783 | 215183360;215183360;215183360;215183360;215183360 | 3244576;3572256;3360064;3403136;3278240 | 271008;544064;372288;422432;289952 |
5 | InceptionV3/InceptionV3/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 163 | 2766080 | 2905600 | 100890368 | GPU_0_bfc | 139520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 37034.67 | 25216.00 | 42.20 | 0.00 | 0.00 | true | 0.421372;0.422683;0.422539;0.421250;0.424089 | 0;0;0;0;0 | 27904;23424;27008;23168;25216 | 36864;36864;37120;37120;37120 |
5 | InceptionV3/InceptionV3/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 163 | 2766080 | 2905600 | 100890368 | GPU_0_bfc | 139520 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 59392 | 256.00 | 54357.33 | 6.20 | 1.09 | 14.85 | true | 0.062366;0.062356;0.062362;0.062359;0.062360 | 59392;59392;59392;59392;59392 | 256;256;256;256;2816 | 54528;47872;55424;56960;53120 |
6 | InceptionV3/InceptionV3/Conv2d_2a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 32 147 147]] | 31 | 2766080 | 0 | 98048512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.33 | 691488 | 256.00 | 261578.67 | 78.00 | 2.64 | 109.19 | true | 0.781499;0.780910;0.782401;0.778443;0.776898 | 691488;691488;691488;691488;691488 | 256;256;256;7008;256 | 325728;137920;243872;215136;352832 |
7 | InceptionV3/InceptionV3/Conv2d_2a_3x3/Relu | Relu | [[1 32 147 147]] | 23.333 | 2766080 | 0 | 98048512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 0.00 | 67061.33 | 67.30 | 0.00 | 0.00 | true | 0.673945;0.673801;0.670753;0.674472;0.670575 | 0;0;0;0;0 | 76096;48480;68512;56576;80576 | 0;6912;0;0;0 |
8 | InceptionV3/InceptionV3/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 149.667 | 5531904 | 5810688 | 103580416 | GPU_0_bfc | 278784 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 47.33 | 430366720 | 2131882.67 | 7660981.33 | 22.90 | 43.95 | 9092.32 | false | 0.227770;0.230401;0.226433;0.230065;0.231043 | 430366720;430366720;430366720;430366720;430366720 | 2089088;2240096;2105856;2200704;2037728 | 7592288;7734432;7660736;7729920;7555872 |
8 | InceptionV3/InceptionV3/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 149.667 | 5531904 | 5810688 | 103580416 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 73728.00 | 8554.67 | 42.30 | 0.00 | 0.00 | true | 0.422591;0.422550;0.425106;0.421354;0.423146 | 0;0;0;0;0 | 73728;73728;73728;73728;73728 | 9024;9664;7104;6208;9536 |
8 | InceptionV3/InceptionV3/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 149.667 | 5531904 | 5810688 | 103580416 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.33 | 118784 | 0.00 | 73866.67 | 6.20 | 1.61 | 35.64 | true | 0.062305;0.062312;0.062309;0.062310;0.062309 | 118784;118784;118784;118784;118784 | 0;256;0;0;0 | 73440;74336;73952;73568;74080 |
9 | InceptionV3/InceptionV3/Conv2d_2b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 64 147 147]] | 34.667 | 5531904 | 0 | 100814336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 11.00 | 1382976 | 1611818.67 | 3239584.00 | 83.10 | 0.29 | 125.73 | true | 0.842225;0.822217;0.804874;0.832246;0.839859 | 1382976;1382976;1382976;1382976;1382976 | 1647424;1710976;1582944;1564576;1605088 | 3237664;3249856;3231232;3226976;3316096 |
10 | InceptionV3/InceptionV3/Conv2d_2b_3x3/Relu | Relu | [[1 64 147 147]] | 26 | 5531904 | 0 | 100814336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 8.00 | 0 | 138.67 | 4783968.00 | 74.40 | 0.00 | 0.00 | true | 0.745854;0.739224;0.733703;0.745492;0.760762 | 0;0;0;0;0 | 160;96;160;96;352 | 4858112;4744416;4837440;4770048;4631360 |
11 | InceptionV3/InceptionV3/MaxPool_3a_3x3/MaxPool | MaxPool | [[1 64 73 73]] | 59 | 1364224 | 1364224 | 102178560 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 341056 | 307562.67 | 2056042.67 | 61.10 | 0.14 | 28.42 | true | 0.615364;0.611277;0.612033;0.610318;0.609039 | 341056;341056;341056;341056;341056 | 319872;306048;311232;304192;305408 | 2063840;2102272;2054080;2028768;2050208 |
12 | InceptionV3/InceptionV3/Conv2d_3b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 80 73 73]] | 117.333 | 1705472 | 1725952 | 98352128 | GPU_0_bfc | 20480 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.00 | 66093392 | 203221.33 | 1803680.00 | 14.90 | 32.93 | 3671.86 | false | 0.149051;0.149912;0.147595;0.148865;0.148158 | 66093392;66093392;66093392;66093392;66093392 | 192128;195968;200576;286976;213120 | 1794656;1758208;1804608;1912960;1811776 |
12 | InceptionV3/InceptionV3/Conv2d_3b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 80 73 73]] | 117.333 | 1705472 | 1725952 | 98352128 | GPU_0_bfc | 20480 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 20544.00 | 2389.33 | 43.70 | 0.00 | 0.00 | true | 0.435574;0.438738;0.441262;0.435138;0.438054 | 0;0;0;0;0 | 20544;20672;20544;20544;20544 | 2048;3648;896;2688;2432 |
13 | InceptionV3/InceptionV3/Conv2d_3b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 80 73 73]] | 29 | 1705472 | 0 | 96987904 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 426320 | 576.00 | 94890.67 | 66.20 | 4.47 | 85.26 | true | 0.672150;0.659555;0.661339;0.664300;0.661021 | 426320;426320;426320;426320;426320 | 576;576;576;5696;576 | 94976;96672;96576;85536;93120 |
14 | InceptionV3/InceptionV3/Conv2d_3b_1x1/Relu | Relu | [[1 80 73 73]] | 22 | 1705472 | 0 | 96987904 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 1792.00 | 63.60 | 0.00 | 0.00 | true | 0.635546;0.636211;0.635137;0.637612;0.635476 | 0;0;0;0;0 | 0;0;0;0;1024 | 1856;1728;1472;2560;1792 |
15 | InceptionV3/InceptionV3/Conv2d_4a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 71 71]] | 182 | 3871488 | 5961216 | 100859392 | GPU_0_bfc | 2089728 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 73.33 | 747048960 | 1482400.00 | 6329397.33 | 22.70 | 95.63 | 10187.08 | false | 0.231990;0.224667;0.221290;0.229150;0.227202 | 747048960;747048960;747048960;747048960;747048960 | 1433088;1499104;1578560;1515008;1373664 | 6263072;6379680;6340448;6358816;6288928 |
15 | InceptionV3/InceptionV3/Conv2d_4a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 71 71]] | 182 | 3871488 | 5961216 | 100859392 | GPU_0_bfc | 2089728 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 552960.00 | 328597.33 | 45.60 | 0.00 | 0.00 | true | 0.457584;0.456479;0.456823;0.456087;0.452930 | 0;0;0;0;0 | 552960;552960;552960;552960;552960 | 325248;330720;347584;329472;325600 |
15 | InceptionV3/InceptionV3/Conv2d_4a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 71 71]] | 182 | 3871488 | 5961216 | 100859392 | GPU_0_bfc | 2089728 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 890880 | 0.00 | 694688.00 | 9.20 | 1.28 | 222.72 | true | 0.091542;0.091731;0.091645;0.091965;0.091800 | 890880;890880;890880;890880;890880 | 0;0;0;256;0 | 695776;695808;700224;690048;692480 |
16 | InceptionV3/InceptionV3/Conv2d_4a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 71 71]] | 30.667 | 3871488 | 0 | 99153920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 7.00 | 967872 | 186464.00 | 1435690.67 | 81.40 | 0.60 | 138.27 | true | 0.819906;0.813517;0.816091;0.804847;0.812467 | 967872;967872;967872;967872;967872 | 1473664;1436704;1326784;1524832;1396704 | 181856;204992;128864;199744;177792 |
17 | InceptionV3/InceptionV3/Conv2d_4a_3x3/Relu | Relu | [[1 192 71 71]] | 22.667 | 3871488 | 0 | 99153920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 6.00 | 0 | 96.00 | 2250186.67 | 75.10 | 0.00 | 0.00 | true | 0.743761;0.749805;0.752057;0.753499;0.750110 | 0;0;0;0;0 | 96;96;96;96;96 | 2253408;2276928;2293856;2166880;2220224 |
18 | InceptionV3/InceptionV3/MaxPool_5a_3x3/MaxPool | MaxPool | [[1 192 35 35]] | 42 | 940800 | 940800 | 100094720 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 235200 | 42.67 | 1263008.00 | 55.70 | 0.19 | 29.40 | true | 0.558108;0.557278;0.556842;0.556261;0.556253 | 235200;235200;235200;235200;235200 | 64;0;1024;0;64 | 1269184;1264192;1242016;1282912;1255648 |
19 | InceptionV3/InceptionV3/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 192 35 35]] | 42.667 | 940800 | 940800 | 97164032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 10.00 | 6217863 | 1856.00 | 932960.00 | 56.20 | 6.65 | 621.79 | true | 0.564177;0.561338;0.561132;0.562728;0.561253 | 6217863;6217863;6217863;6217863;6217863 | 2112;1856;1856;1856;1856 | 933376;931136;933696;935136;931808 |
20 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 100.333 | 313600 | 362752 | 97477632 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 30749248 | 2048.00 | 313269.33 | 3.10 | 97.52 | 2196.37 | false | 0.031213;0.031214;0.031213;0.031213;0.031215 | 30749248;30749248;30749248;30749248;30749248 | 2048;2048;2048;2048;2048 | 311296;313056;315904;314688;312064 |
20 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 100.333 | 313600 | 362752 | 97477632 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 49152.00 | 629.33 | 41.20 | 0.00 | 0.00 | true | 0.412262;0.412192;0.412179;0.412044;0.413120 | 0;0;0;0;0 | 49152;51200;49152;49152;49152 | 800;544;672;672;544 |
21 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 48 35 35]] | 96.333 | 235264 | 272128 | 97712896 | GPU_0_bfc | 36864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 30729648 | 64.00 | 230602.67 | 3.10 | 133.22 | 2194.97 | false | 0.031129;0.031129;0.031129;0.031123;0.031130 | 30729648;30729648;30729648;30729648;30729648 | 128;64;64;64;64 | 232832;228128;230048;231200;230560 |
21 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 48 35 35]] | 96.333 | 235264 | 272128 | 97712896 | GPU_0_bfc | 36864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 36864.00 | 7530.67 | 43.40 | 0.00 | 0.00 | true | 0.435841;0.430620;0.431078;0.436288;0.435514 | 0;0;0;0;0 | 36864;36864;36864;36864;36864 | 6720;7936;7104;7552;8512 |
22 | InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 94.667 | 313600 | 362752 | 98026496 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 30749248 | 0.00 | 298805.33 | 3.10 | 102.91 | 2196.37 | false | 0.031212;0.031214;0.031213;0.031213;0.031212 | 30749248;30749248;30749248;30749248;30749248 | 298624;297920;299296;299520;298496 | 0;0;0;0;0 |
22 | InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 94.667 | 313600 | 362752 | 98026496 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 49152.00 | 3530.67 | 41.40 | 0.00 | 0.00 | true | 0.413675;0.416482;0.415848;0.412896;0.412749 | 0;0;0;0;0 | 49152;49152;49152;49152;49152 | 3328;3968;3072;3712;3552 |
23 | InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 32 35 35]] | 92.667 | 156928 | 181504 | 97242624 | GPU_0_bfc | 24576 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 14.00 | 15374624 | 0.00 | 45664.00 | 3.10 | 336.69 | 1098.19 | false | 0.031243;0.031243;0.031244;0.031243;0.031244 | 15374624;15374624;15374624;15374624;15374624 | 0;0;0;0;0 | 45792;47648;45600;44064;45600 |
23 | InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 32 35 35]] | 92.667 | 156928 | 181504 | 97242624 | GPU_0_bfc | 24576 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 24576.00 | 10677.33 | 41.40 | 0.00 | 0.00 | true | 0.413958;0.416673;0.413697;0.415438;0.411989 | 0;0;0;0;0 | 24576;24576;24576;24576;24576 | 11296;10656;10496;10144;10880 |
24 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28 | 313600 | 0 | 96301824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 512.00 | 0.00 | 45.30 | 153.12 | 19.60 | false | 0.456479;0.453852;0.452364;0.453897;0.452327 | 78400;78400;78400;78400;78400 | 5888;512;512;512;512 | 128;0;0;0;0 |
25 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 48 35 35]] | 20.667 | 235264 | 0 | 96301824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 58800 | 192.00 | 234.67 | 44.10 | 137.81 | 14.70 | false | 0.441380;0.439423;0.440675;0.440900;0.442000 | 58800;58800;58800;58800;58800 | 192;192;192;192;192 | 192;128;384;256;256 |
26 | InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 20 | 313600 | 0 | 96301824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 256.00 | 44.00 | 153.12 | 19.60 | false | 0.440313;0.440308;0.440179;0.440464;0.440455 | 78400;78400;78400;78400;78400 | 512;256;256;256;256 | 384;384;256;128;128 |
27 | InceptionV3/InceptionV3/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 32 35 35]] | 20.333 | 156928 | 0 | 96301824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 39200 | 128.00 | 21.33 | 44.30 | 262.50 | 9.80 | false | 0.442640;0.443726;0.441073;0.442230;0.443791 | 39200;39200;39200;39200;39200 | 128;128;128;128;128 | 64;0;256;0;0 |
28 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 20.333 | 313600 | 0 | 96301824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438304;0.438201;0.438750;0.438359;0.438081 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
29 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 48 35 35]] | 19.333 | 235264 | 0 | 96301824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 128.00 | 43.70 | 0.00 | 0.00 | true | 0.436875;0.437292;0.437124;0.436922;0.436705 | 0;0;0;0;0 | 0;0;0;0;2560 | 64;128;128;128;128 |
30 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 133.667 | 470528 | 1724672 | 96772352 | GPU_0_bfc | 1254144 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.67 | 99993600 | 85.33 | 14912.00 | 12.50 | 6667.43 | 5356.70 | false | 0.124608;0.124622;0.124631;0.124631;0.124615 | 99993600;99993600;99993600;99993600;99993600 | 0;256;0;0;6656 | 15232;14944;14816;14976;14720 |
30 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 133.667 | 470528 | 1724672 | 96772352 | GPU_0_bfc | 1254144 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 21.33 | 44.80 | 0.00 | 0.00 | true | 0.447635;0.450179;0.447622;0.447584;0.446824 | 0;0;0;0;0 | 226048;221184;221184;221184;221184 | 64;0;256;0;0 |
30 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 133.667 | 470528 | 1724672 | 96772352 | GPU_0_bfc | 1254144 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 356352 | 0.00 | 2922.67 | 6.20 | 121.93 | 97.18 | false | 0.062288;0.062287;0.062299;0.062296;0.062287 | 356352;356352;356352;356352;356352 | 6144;0;0;0;0 | 2976;2912;3136;2880;2848 |
31 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 149.333 | 313600 | 620800 | 96772352 | GPU_0_bfc | 307200 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.00 | 191771200 | 0.00 | 2720.00 | 3.10 | 70504.12 | 3143.79 | false | 0.031244;0.031245;0.031245;0.031245;0.031244 | 191771200;191771200;191771200;191771200;191771200 | 2720;2720;2720;2720;2720 | 0;0;0;5120;0 |
31 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 149.333 | 313600 | 620800 | 96772352 | GPU_0_bfc | 307200 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 307200.00 | 128.00 | 42.50 | 0.00 | 0.00 | true | 0.424589;0.426414;0.424698;0.426702;0.420951 | 0;0;0;0;0 | 64;128;128;128;128 | 307200;307200;307200;307200;307200 |
32 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27 | 470528 | 0 | 96537088 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 0.00 | 54.60 | 306.25 | 29.40 | false | 0.545916;0.545616;0.544980;0.547036;0.546114 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;0;0;0;0 |
33 | InceptionV3/InceptionV3/Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 20.667 | 313600 | 0 | 96537088 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 0.00 | 44.00 | 306.25 | 19.60 | false | 0.440370;0.440601;0.439481;0.441703;0.440469 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 128;0;0;0;0 |
34 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 20.333 | 470528 | 0 | 96537088 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 128.00 | 59.60 | 0.00 | 0.00 | true | 0.589998;0.597158;0.598219;0.597684;0.593582 | 0;0;0;0;0 | 0;128;128;128;128 | 0;0;0;0;0 |
35 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 124.333 | 470528 | 1724416 | 97007616 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 24.33 | 149022720 | 0.00 | 536917.33 | 12.50 | 277.55 | 6124.31 | false | 0.124726;0.124726;0.124731;0.124738;0.124726 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;0 | 536928;537280;536288;536544;537984 |
35 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 124.333 | 470528 | 1724416 | 97007616 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 331776.00 | 11616.00 | 44.10 | 0.00 | 0.00 | true | 0.441875;0.444108;0.440039;0.441339;0.434974 | 0;0;0;0;0 | 331776;331776;331776;331776;331776 | 12608;10688;11136;12128;11584 |
35 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 124.333 | 470528 | 1724416 | 97007616 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 189781.33 | 6.20 | 2.82 | 133.63 | true | 0.062257;0.062251;0.062232;0.062239;0.062245 | 534528;534528;534528;534528;534528 | 0;0;0;0;0 | 191712;190080;188480;190528;188736 |
36 | InceptionV3/InceptionV3/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 26.667 | 470528 | 0 | 96537088 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 128.00 | 54.60 | 229.69 | 29.40 | false | 0.546405;0.544402;0.546477;0.546247;0.545714 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;128;128;128;128 |
38 | InceptionV3/InceptionV3/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 256 35 35]] | 26 | 1254400 | 0 | 96536832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 291360.00 | 60.80 | 0.00 | 0.00 | true | 0.608250;0.608271;0.606258;0.608461;0.605952 | 0;0;0;0;0 | 291936;291296;291264;290496;291520 | 0;0;0;0;0 |
39 | InceptionV3/InceptionV3/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 256 35 35]] | 47 | 1254400 | 1254400 | 97791232 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14.00 | 7352239 | 0.00 | 402816.00 | 57.50 | 18.25 | 525.16 | false | 0.575065;0.574854;0.575588;0.575455;0.574538 | 7352239;7352239;7352239;7352239;7352239 | 402784;402880;402208;403392;402784 | 0;0;0;0;0 |
40 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 102.667 | 313600 | 379136 | 98104832 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 40972864 | 256.00 | 308320.00 | 3.10 | 132.78 | 2410.17 | false | 0.031221;0.031221;0.031219;0.031219;0.031220 | 40972864;40972864;40972864;40972864;40972864 | 307616;308800;308384;308096;308480 | 256;256;256;256;256 |
40 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 102.667 | 313600 | 379136 | 98104832 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 65536.00 | 170.67 | 41.40 | 0.00 | 0.00 | true | 0.412131;0.413151;0.413073;0.414856;0.414389 | 0;0;0;0;0 | 65536;65536;65536;65536;65536 | 384;128;128;256;128 |
41 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 48 35 35]] | 98 | 235264 | 284416 | 98340096 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 40953264 | 0.00 | 162933.33 | 3.10 | 251.35 | 2409.02 | false | 0.031154;0.031154;0.031154;0.031153;0.031155 | 40953264;40953264;40953264;40953264;40953264 | 0;0;0;0;0 | 162464;162048;163584;163424;162912 |
41 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 48 35 35]] | 98 | 235264 | 284416 | 98340096 | GPU_0_bfc | 49152 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 49152.00 | 42.67 | 43.30 | 0.00 | 0.00 | true | 0.432283;0.431428;0.433338;0.434724;0.438791 | 0;0;0;0;0 | 49152;49152;49152;49152;49152 | 128;0;0;128;0 |
42 | InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 96.667 | 313600 | 379136 | 98653696 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 40972864 | 0.00 | 329888.00 | 3.10 | 124.20 | 2410.17 | false | 0.031222;0.031221;0.031223;0.031223;0.031222 | 40972864;40972864;40972864;40972864;40972864 | 0;0;0;0;0 | 329568;330176;330048;329632;329984 |
42 | InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 96.667 | 313600 | 379136 | 98653696 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 65536.00 | 0.00 | 41.40 | 0.00 | 0.00 | true | 0.412888;0.412460;0.416021;0.416490;0.413082 | 0;0;0;0;0 | 67328;65536;65536;65536;65536 | 128;0;0;0;0 |
43 | InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 97.667 | 313600 | 379136 | 97712896 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 17.00 | 40972864 | 0.00 | 313930.67 | 3.10 | 130.52 | 2410.17 | false | 0.031223;0.031223;0.031224;0.031222;0.031221 | 40972864;40972864;40972864;40972864;40972864 | 0;6656;0;0;0 | 313920;314144;313824;313952;313920 |
43 | InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 97.667 | 313600 | 379136 | 97712896 | GPU_0_bfc | 65536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 65536.00 | 0.00 | 41.30 | 0.00 | 0.00 | true | 0.412577;0.415210;0.412581;0.412659;0.416298 | 0;0;0;0;0 | 65536;65536;65536;65536;65536 | 0;0;0;0;0 |
44 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 27.667 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 597.33 | 82944.00 | 44.50 | 0.94 | 19.60 | true | 0.443720;0.444104;0.445646;0.445160;0.445860 | 78400;78400;78400;78400;78400 | 4096;256;256;1280;256 | 85920;84256;82592;81984;81824 |
45 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 48 35 35]] | 22 | 235264 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 58800 | 192.00 | 51285.33 | 44.10 | 1.14 | 14.70 | true | 0.440555;0.440767;0.440495;0.442771;0.442695 | 58800;58800;58800;58800;58800 | 192;192;192;192;192 | 47872;48896;51712;53248;53248 |
46 | InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 21.333 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 44714.67 | 44.10 | 1.74 | 19.60 | true | 0.441988;0.440301;0.440459;0.440781;0.442291 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 44416;44800;44928;45568;44032 |
47 | InceptionV3/InceptionV3/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 20 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 120704.00 | 44.10 | 0.65 | 19.60 | true | 0.440350;0.439783;0.443227;0.441311;0.440840 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 121344;121344;120320;119296;120448 |
48 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 21.333 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438894;0.438009;0.438280;0.437970;0.439003 | 0;0;0;0;0 | 128;0;0;0;0 | 0;0;0;0;0 |
49 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv2d_0b_1x1/Relu | Relu | [[1 48 35 35]] | 19 | 235264 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 640.00 | 43.60 | 0.00 | 0.00 | true | 0.435834;0.435908;0.435514;0.435529;0.435703 | 0;0;0;0;0 | 0;0;0;0;0 | 512;640;640;640;640 |
50 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 120.667 | 470528 | 1306624 | 96929024 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 19.00 | 99993600 | 0.00 | 713013.33 | 12.50 | 140.24 | 5262.82 | false | 0.124623;0.124622;0.124624;0.124644;0.124641 | 99993600;99993600;99993600;99993600;99993600 | 0;0;0;0;0 | 713888;712288;712768;713568;712704 |
50 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 120.667 | 470528 | 1306624 | 96929024 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 49322.67 | 44.60 | 0.00 | 0.00 | true | 0.445058;0.446321;0.447097;0.446962;0.444507 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 49280;49280;49408;49664;46976 |
50 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 120.667 | 470528 | 1306624 | 96929024 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 170069.33 | 6.20 | 2.10 | 89.09 | true | 0.062248;0.062226;0.062240;0.062232;0.062240 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 169920;170816;167424;169472;172640 |
51 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 144.333 | 313600 | 620800 | 96929024 | GPU_0_bfc | 307200 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.67 | 191771200 | 0.00 | 78869.33 | 3.10 | 2431.51 | 3109.79 | false | 0.031244;0.031244;0.031244;0.031245;0.031244 | 191771200;191771200;191771200;191771200;191771200 | 0;0;0;0;0 | 78144;78784;81344;79040;78784 |
51 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 144.333 | 313600 | 620800 | 96929024 | GPU_0_bfc | 307200 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 307200.00 | 15210.67 | 42.60 | 0.00 | 0.00 | true | 0.428960;0.425949;0.425751;0.425201;0.420010 | 0;0;0;0;0 | 307200;307200;307200;307200;307200 | 15552;14016;14656;15424;16128 |
52 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27 | 470528 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 256.00 | 54.60 | 183.75 | 29.40 | false | 0.545459;0.545534;0.547717;0.544679;0.545917 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 256;640;256;256;256 |
53 | InceptionV3/InceptionV3/Mixed_5c/Branch_1/Conv_1_0c_5x5/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 21.667 | 313600 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 0.00 | 44.10 | 306.25 | 19.60 | false | 0.441346;0.440911;0.439535;0.441754;0.442243 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 128;0;0;0;0 |
54 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 21 | 470528 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 128.00 | 59.60 | 0.00 | 0.00 | true | 0.595489;0.595745;0.595862;0.595476;0.596178 | 0;0;0;0;0 | 0;0;3072;0;0 | 0;128;128;256;128 |
55 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 122 | 470528 | 1941248 | 97164288 | GPU_0_bfc | 1470720 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 25.00 | 149022720 | 0.00 | 283317.33 | 12.50 | 525.99 | 5960.91 | false | 0.124732;0.124712;0.124731;0.124719;0.124729 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;0 | 283360;280832;283328;283264;283904 |
55 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 122 | 470528 | 1941248 | 97164288 | GPU_0_bfc | 1470720 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 331776.00 | 0.00 | 44.60 | 0.00 | 0.00 | true | 0.446993;0.447611;0.445311;0.445226;0.438314 | 0;0;0;0;0 | 331776;331776;331776;331776;333824 | 0;0;0;0;0 |
55 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 122 | 470528 | 1941248 | 97164288 | GPU_0_bfc | 1470720 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 75349.33 | 6.20 | 7.09 | 133.63 | true | 0.062136;0.062123;0.062131;0.062124;0.062109 | 534528;534528;534528;534528;534528 | 0;0;6400;0;0 | 75360;75904;75872;74816;73472 |
56 | InceptionV3/InceptionV3/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 26.333 | 470528 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 170.67 | 54.60 | 212.02 | 29.40 | false | 0.546720;0.546176;0.544320;0.546421;0.547910 | 117600;117600;117600;117600;117600 | 384;2432;384;384;384 | 0;256;128;128;1024 |
58 | InceptionV3/InceptionV3/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 288 35 35]] | 25.667 | 1411328 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 261952.00 | 60.80 | 0.00 | 0.00 | true | 0.611082;0.609922;0.606630;0.602207;0.608415 | 0;0;0;0;0 | 0;0;0;0;0 | 261920;261920;261888;262016;262272 |
59 | InceptionV3/InceptionV3/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 288 35 35]] | 47 | 2430464 | 2430464 | 99124224 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.00 | 8061987 | 0.00 | 534112.00 | 58.80 | 15.09 | 537.47 | true | 0.588232;0.587568;0.586621;0.587580;0.587599 | 8061987;8061987;8061987;8061987;8061987 | 532576;535456;533056;533824;538240 | 0;0;0;0;0 |
60 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 101.667 | 313600 | 387328 | 99437824 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.33 | 46084672 | 0.00 | 369888.00 | 3.10 | 124.59 | 2513.76 | false | 0.031225;0.031224;0.031226;0.031227;0.031226 | 46084672;46084672;46084672;46084672;46084672 | 0;0;0;0;0 | 369632;369120;370144;370144;369888 |
60 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 101.667 | 313600 | 387328 | 99437824 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 73728.00 | 128.00 | 41.80 | 0.00 | 0.00 | true | 0.421149;0.417747;0.418027;0.418239;0.417872 | 0;0;0;0;0 | 73728;73728;73728;73728;73728 | 256;128;128;0;128 |
61 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 48 35 35]] | 98 | 235264 | 290560 | 99673088 | GPU_0_bfc | 55296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.00 | 46065072 | 0.00 | 199306.67 | 3.10 | 231.13 | 2559.17 | false | 0.031163;0.031163;0.031163;0.031163;0.031163 | 46065072;46065072;46065072;46065072;46065072 | 0;0;0;0;0 | 198880;199936;199104;201504;196352 |
61 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 48 35 35]] | 98 | 235264 | 290560 | 99673088 | GPU_0_bfc | 55296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 55296.00 | 128.00 | 43.20 | 0.00 | 0.00 | true | 0.431021;0.430995;0.432756;0.431137;0.436405 | 0;0;0;0;0 | 55296;55296;55296;55296;55296 | 128;128;128;128;128 |
62 | InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 98.667 | 313600 | 387328 | 99986688 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.67 | 46084672 | 0.00 | 293173.33 | 3.10 | 157.19 | 2468.78 | false | 0.031222;0.031221;0.031223;0.031223;0.031224 | 46084672;46084672;46084672;46084672;46084672 | 256;0;0;0;0 | 291680;292416;295424;289280;295520 |
62 | InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 98.667 | 313600 | 387328 | 99986688 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 73728.00 | 22122.67 | 41.90 | 0.00 | 0.00 | true | 0.418174;0.419463;0.418972;0.417870;0.418735 | 0;0;0;0;0 | 73728;73728;73728;73728;73728 | 21984;22528;21760;21856;24448 |
63 | InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 98 | 313600 | 387328 | 98888960 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.00 | 46084672 | 0.00 | 203104.00 | 3.10 | 226.90 | 2560.26 | false | 0.031224;0.031224;0.031225;0.031223;0.031224 | 46084672;46084672;46084672;46084672;46084672 | 256;0;0;0;0 | 203968;203744;202560;202240;203008 |
63 | InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 98 | 313600 | 387328 | 98888960 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 73728.00 | 13909.33 | 42.20 | 0.00 | 0.00 | true | 0.421648;0.421259;0.422674;0.422500;0.421502 | 0;0;0;0;0 | 73728;73728;73728;80128;73728 | 14592;13696;12544;17120;13440 |
64 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 128.00 | 44.50 | 204.17 | 19.60 | false | 0.444185;0.445423;0.445578;0.445290;0.445206 | 78400;78400;78400;78400;78400 | 2816;256;256;256;256 | 256;128;128;128;128 |
65 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 48 35 35]] | 21 | 235264 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 58800 | 192.00 | 35882.67 | 44.10 | 1.63 | 14.70 | true | 0.440139;0.441270;0.439943;0.441903;0.442163 | 58800;58800;58800;58800;58800 | 192;192;192;192;192 | 35712;36224;35968;35840;35840 |
66 | InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 20 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 138378.67 | 44.10 | 0.57 | 19.60 | true | 0.441420;0.441061;0.439986;0.441318;0.441600 | 78400;78400;78400;78400;78400 | 139008;139136;137856;138240;137888 | 256;256;256;256;256 |
67 | InceptionV3/InceptionV3/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 20.667 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 28032.00 | 44.10 | 2.77 | 19.60 | true | 0.441288;0.440851;0.440718;0.441213;0.440691 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 28032;28032;28032;28032;28032 |
68 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 20 | 313600 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438400;0.438336;0.438258;0.438316;0.437807 | 0;0;0;0;0 | 1024;0;0;0;0 | 128;0;0;0;0 |
69 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 48 35 35]] | 19.667 | 235264 | 0 | 96458496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 128.00 | 43.50 | 0.00 | 0.00 | true | 0.435066;0.434489;0.434571;0.434418;0.434592 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;128;128;128 |
70 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.333 | 470528 | 1306624 | 96929024 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.33 | 99993600 | 0.00 | 758442.67 | 12.50 | 131.84 | 5454.30 | false | 0.124627;0.124610;0.124628;0.124629;0.124631 | 99993600;99993600;99993600;99993600;99993600 | 758208;758720;758400;748640;759040 | 0;0;0;0;0 |
70 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.333 | 470528 | 1306624 | 96929024 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 221184.00 | 0.00 | 44.50 | 0.00 | 0.00 | true | 0.445607;0.445386;0.445247;0.444796;0.443529 | 0;0;0;0;0 | 221184;221184;221184;221184;222976 | 0;0;0;0;0 |
70 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.333 | 470528 | 1306624 | 96929024 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.33 | 356352 | 0.00 | 7296.00 | 6.20 | 48.84 | 106.92 | false | 0.062192;0.062214;0.062217;0.062198;0.062201 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 7296;7296;7296;12160;7296 |
71 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 144.667 | 313600 | 620800 | 96929024 | GPU_0_bfc | 307200 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 61.00 | 191771200 | 0.00 | 237045.33 | 3.10 | 809.01 | 3143.79 | false | 0.031245;0.031244;0.031244;0.031244;0.031244 | 191771200;191771200;191771200;191771200;191771200 | 0;0;0;0;0 | 237792;235904;237536;237184;236416 |
71 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 144.667 | 313600 | 620800 | 96929024 | GPU_0_bfc | 307200 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 307200.00 | 115637.33 | 42.80 | 0.00 | 0.00 | true | 0.428488;0.428380;0.426279;0.428952;0.421585 | 0;0;0;0;0 | 307200;307200;307200;307200;307200 | 114752;116096;114400;117024;116064 |
72 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 26.667 | 470528 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 42.67 | 54.70 | 275.62 | 29.40 | false | 0.543615;0.546777;0.546978;0.546428;0.547368 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;128;0;0;128 |
73 | InceptionV3/InceptionV3/Mixed_5d/Branch_1/Conv2d_0b_5x5/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 21.333 | 313600 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 0.00 | 44.10 | 306.25 | 19.60 | false | 0.440751;0.440487;0.440602;0.442529;0.441005 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 128;0;0;0;0 |
74 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 20.333 | 470528 | 0 | 96693760 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 128.00 | 59.70 | 0.00 | 0.00 | true | 0.597118;0.597935;0.598101;0.596885;0.597341 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;256;128;128 |
75 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 122.667 | 862464 | 2116352 | 97556224 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 24.00 | 149022720 | 0.00 | 436778.67 | 12.50 | 341.19 | 6209.28 | false | 0.124729;0.124735;0.124732;0.124737;0.124739 | 149022720;149022720;149022720;149022720;149022720 | 435840;426976;439328;438336;436160 | 0;0;0;0;0 |
75 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 122.667 | 862464 | 2116352 | 97556224 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 331776.00 | 0.00 | 44.50 | 0.00 | 0.00 | true | 0.443500;0.443469;0.447192;0.444690;0.448088 | 0;0;0;0;0 | 160;0;0;0;0 | 331776;333568;331776;331776;331776 |
75 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 122.667 | 862464 | 2116352 | 97556224 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 160106.67 | 6.20 | 3.34 | 133.63 | true | 0.062065;0.062053;0.062052;0.062035;0.062042 | 534528;534528;534528;534528;534528 | 0;0;0;0;0 | 160704;163264;159808;159808;158976 |
76 | InceptionV3/InceptionV3/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27.667 | 862464 | 0 | 97085696 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 768.00 | 54.60 | 102.08 | 29.40 | false | 0.544899;0.544917;0.546861;0.546689;0.545142 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 768;512;896;640;1024 |
78 | InceptionV3/InceptionV3/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 288 35 35]] | 24.333 | 2430464 | 0 | 97712896 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 239744.00 | 60.70 | 0.00 | 0.00 | true | 0.608265;0.606933;0.605727;0.607033;0.604431 | 0;0;0;0;0 | 0;0;0;0;0 | 239808;240288;239520;239648;239776 |
79 | InceptionV3/InceptionV3/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 288 17 17]] | 38.333 | 333056 | 333056 | 98045952 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.00 | 83232 | 0.00 | 228778.67 | 34.50 | 0.36 | 16.65 | true | 0.344748;0.345996;0.344831;0.345974;0.345224 | 83232;83232;83232;83232;83232 | 229440;228320;229312;228704;228320 | 0;0;0;0;0 |
80 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 101.667 | 313600 | 387328 | 98359552 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.67 | 46084672 | 0.00 | 260810.67 | 3.10 | 176.70 | 2468.78 | false | 0.031221;0.031222;0.031222;0.031223;0.031223 | 46084672;46084672;46084672;46084672;46084672 | 0;0;0;0;0 | 260128;261248;260384;260896;261152 |
80 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 101.667 | 313600 | 387328 | 98359552 | GPU_0_bfc | 73728 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 73728.00 | 298.67 | 41.90 | 0.00 | 0.00 | true | 0.419075;0.418715;0.418941;0.418603;0.418671 | 0;0;0;0;0 | 73728;73728;73728;73728;73728 | 512;256;256;256;384 |
81 | InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 294 | 443904 | 4425216 | 98803456 | GPU_0_bfc | 3981312 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 190.33 | 637120896 | 3400362.67 | 1111680.00 | 4.70 | 141.20 | 3347.40 | false | 0.047047;0.047080;0.046903;0.046996;0.047146 | 637120896;637120896;637120896;637120896;637120896 | 1122816;1110528;1100288;1111936;1112576 | 3410560;3418240;3364352;3390464;3400064 |
81 | InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 294 | 443904 | 4425216 | 98803456 | GPU_0_bfc | 3981312 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 23.33 | 0 | 3986389.33 | 3581365.33 | 47.20 | 0.00 | 0.00 | true | 0.471467;0.473334;0.471617;0.472882;0.461471 | 0;0;0;0;0 | 3989184;3985024;3984960;3991808;3984064 | 3587072;3601440;3571328;3577280;3579744 |
82 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 27.333 | 313600 | 0 | 96372992 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 78400 | 314176.00 | 97226.67 | 45.60 | 0.19 | 16.80 | true | 0.454904;0.455301;0.455757;0.455877;0.455602 | 78400;78400;78400;78400;78400 | 98336;97440;96672;97568;96416 | 314176;320832;314176;314176;314176 |
83 | InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 21 | 443904 | 0 | 96372992 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 38688.00 | 51.90 | 2.76 | 27.74 | true | 0.518594;0.519073;0.519681;0.520055;0.518242 | 110976;110976;110976;110976;110976 | 7168;1536;1536;1536;1536 | 36128;38304;39840;39712;38048 |
84 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19.667 | 313600 | 0 | 96372992 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 341.33 | 43.80 | 0.00 | 0.00 | true | 0.437802;0.437811;0.438072;0.437343;0.437983 | 0;0;0;0;0 | 0;1024;0;0;0 | 1792;768;128;128;128 |
85 | InceptionV3/InceptionV3/Mixed_6a/Branch_0/Conv2d_1a_1x1/Relu | Relu | [[1 384 17 17]] | 19 | 443904 | 0 | 96372992 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 1322.67 | 57.20 | 0.00 | 0.00 | true | 0.572722;0.571949;0.574411;0.564346;0.572079 | 0;0;0;0;0 | 5120;0;0;0;0 | 2944;3584;512;512;512 |
86 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.667 | 470528 | 1306624 | 96843520 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.67 | 99993600 | 0.00 | 445024.00 | 12.50 | 224.69 | 5356.70 | false | 0.124615;0.124644;0.124617;0.124635;0.124629 | 99993600;99993600;99993600;99993600;99993600 | 0;0;0;0;0 | 441376;443168;465056;444000;447904 |
86 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.667 | 470528 | 1306624 | 96843520 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 102432.00 | 45.00 | 0.00 | 0.00 | true | 0.450300;0.449518;0.449397;0.450343;0.448381 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 101408;102560;103328;103840;99744 |
86 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.667 | 470528 | 1306624 | 96843520 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 356352 | 0.00 | 109312.00 | 6.20 | 3.26 | 97.18 | true | 0.062249;0.062259;0.062212;0.062246;0.062245 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 106112;101632;112000;109824;112640 |
87 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27 | 470528 | 0 | 96529920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 170.67 | 54.50 | 212.02 | 29.40 | false | 0.543790;0.546091;0.544747;0.545379;0.545743 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 384;0;0;128;1536 |
88 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 22 | 470528 | 0 | 96529920 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 59.60 | 0.00 | 0.00 | true | 0.595373;0.596272;0.595836;0.596420;0.596215 | 0;0;0;0;0 | 128;0;0;0;0 | 0;0;0;0;0 |
89 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 17 17]] | 131.333 | 111104 | 442880 | 96641024 | GPU_0_bfc | 331776 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.67 | 53111904 | 0.00 | 586.67 | 3.10 | 90531.60 | 1163.03 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 53111904;53111904;53111904;53111904;53111904 | 544;544;544;672;2592 | 0;0;0;0;0 |
89 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 17 17]] | 131.333 | 111104 | 442880 | 96641024 | GPU_0_bfc | 331776 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 331776.00 | 33024.00 | 44.20 | 0.00 | 0.00 | true | 0.441383;0.442397;0.449570;0.443239;0.435427 | 0;0;0;0;0 | 34176;32768;32768;33536;32384 | 331776;331776;331776;331776;342528 |
90 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 17 17]] | 26.333 | 111104 | 0 | 96170496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 27744 | 384.00 | 128.00 | 44.80 | 54.19 | 6.94 | false | 0.449700;0.447180;0.453075;0.447823;0.447507 | 27744;27744;27744;27744;27744 | 384;384;384;384;384 | 128;128;128;384;128 |
91 | InceptionV3/InceptionV3/Mixed_6a/Branch_1/Conv2d_1a_1x1/Relu | Relu | [[1 96 17 17]] | 20.667 | 111104 | 0 | 96170496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438915;0.439216;0.438578;0.439215;0.438565 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;0 |
93 | InceptionV3/InceptionV3/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 768 17 17]] | 47 | 887808 | 887808 | 97058048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 11.00 | 5475924 | 0.00 | 557344.00 | 50.10 | 9.83 | 497.81 | true | 0.500639;0.501817;0.501206;0.501005;0.501159 | 5475924;5475924;5475924;5475924;5475924 | 0;0;0;0;0 | 551168;556448;564512;556608;558976 |
94 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 124.667 | 147968 | 541184 | 97206016 | GPU_0_bfc | 393216 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 62951552 | 0.00 | 348128.00 | 3.10 | 180.83 | 1573.79 | false | 0.031248;0.031249;0.031248;0.031248;0.031248 | 62951552;62951552;62951552;62951552;62951552 | 0;0;0;0;0 | 351840;345824;343008;354944;346720 |
94 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 124.667 | 147968 | 541184 | 97206016 | GPU_0_bfc | 393216 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 393216.00 | 139989.33 | 44.40 | 0.00 | 0.00 | true | 0.442415;0.442705;0.444613;0.443611;0.447736 | 0;0;0;0;0 | 393216;393216;393216;393216;393216 | 139616;143904;136896;138944;141408 |
95 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 125 | 147968 | 541184 | 97353984 | GPU_0_bfc | 393216 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.33 | 62951552 | 0.00 | 105834.67 | 3.10 | 594.81 | 1560.80 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 62951552;62951552;62951552;62951552;62951552 | 0;0;0;0;0 | 106432;100800;105920;105152;109760 |
95 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 125 | 147968 | 541184 | 97353984 | GPU_0_bfc | 393216 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.33 | 0 | 393898.67 | 72352.00 | 44.60 | 0.00 | 0.00 | true | 0.443862;0.448889;0.445949;0.444326;0.450099 | 0;0;0;0;0 | 72480;79648;72352;72224;71200 | 393216;393216;395264;395264;393216 |
96 | InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 123 | 221952 | 811776 | 97575936 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.33 | 94427328 | 0.00 | 220394.67 | 3.10 | 428.45 | 2341.19 | false | 0.031245;0.031245;0.031245;0.031246;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 224864;218208;210048;224096;218880 | 0;0;0;0;0 |
96 | InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 123 | 221952 | 811776 | 97575936 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 191136.00 | 45.10 | 0.00 | 0.00 | true | 0.451832;0.451570;0.450872;0.452189;0.448802 | 0;0;0;0;0 | 589824;590848;589824;589824;589824 | 186880;192128;200928;189056;192224 |
97 | InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 122.667 | 221952 | 887808 | 96910080 | GPU_0_bfc | 665856 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.33 | 94427328 | 0.00 | 576.00 | 3.10 | 163936.33 | 2341.19 | false | 0.031244;0.031244;0.031244;0.031244;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 704;576;576;576;576 | 0;0;0;0;0 |
97 | InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 122.667 | 221952 | 887808 | 96910080 | GPU_0_bfc | 665856 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 256.00 | 44.70 | 0.00 | 0.00 | true | 0.448984;0.446554;0.445568;0.447539;0.442610 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 256;256;2784;256;256 |
98 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 28 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 256.00 | 44.00 | 48.17 | 9.25 | false | 0.440310;0.439707;0.439433;0.439293;0.439372 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 256;256;256;256;384 |
99 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 21 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 0.00 | 43.40 | 72.25 | 9.25 | false | 0.432748;0.432366;0.433530;0.435057;0.434389 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 0;0;0;0;0 |
100 | InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.667 | 221952 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 213.33 | 44.20 | 56.54 | 13.87 | false | 0.441989;0.441099;0.443876;0.441704;0.442894 | 55488;55488;55488;55488;55488 | 256;256;128;256;128 | 768;768;768;768;768 |
101 | InceptionV3/InceptionV3/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.333 | 221952 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 0.00 | 44.20 | 72.25 | 13.87 | false | 0.441825;0.442154;0.442182;0.442071;0.443535 | 55488;55488;55488;55488;55488 | 768;768;768;768;768 | 128;0;0;0;0 |
102 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 128 17 17]] | 20.333 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 128.00 | 43.80 | 0.00 | 0.00 | true | 0.438253;0.438231;0.438701;0.437098;0.437235 | 0;0;0;0;0 | 256;0;0;0;0 | 0;128;128;128;128 |
103 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 128 17 17]] | 19 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 0.00 | 43.20 | 0.00 | 0.00 | true | 0.432497;0.432013;0.431861;0.431392;0.431926 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
104 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 188.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 18.00 | 109252608 | 13237.33 | 437002.67 | 6.20 | 242.65 | 6069.59 | false | 0.062460;0.062455;0.062453;0.062454;0.062452 | 109252608;109252608;109252608;109252608;109252608 | 428416;436928;436608;437472;444224 | 13664;13408;12640;12128;16992 |
104 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 188.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 5832704 | 15018.67 | 1287477.33 | 9.60 | 4.48 | 486.06 | true | 0.096426;0.096024;0.095547;0.095550;0.095445 | 5832704;5832704;5832704;5832704;5832704 | 17408;15872;14592;14592;14336 | 1266080;1271424;1300896;1301408;1290112 |
104 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 188.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 774656 | 8352.00 | 357504.00 | 1.60 | 2.12 | 70.42 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 774656;774656;774656;774656;774656 | 368480;357600;356640;357760;357152 | 15008;8352;8608;8096;8096 |
104 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 188.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 726784 | 7509.33 | 336234.67 | 1.60 | 2.11 | 103.83 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 726784;726784;726784;726784;726784 | 332320;338560;338272;338112;331360 | 10240;7680;7424;7424;7424 |
104 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 188.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 458752.00 | 256.00 | 45.00 | 0.00 | 0.00 | true | 0.451467;0.450603;0.449542;0.449557;0.446677 | 0;0;0;0;0 | 458752;458752;458752;458752;458752 | 256;256;1280;256;256 |
105 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 165.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 774656 | 16810.67 | 69792.00 | 1.60 | 8.94 | 55.33 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 774656;774656;774656;774656;774656 | 19712;16128;16640;16896;16896 | 64576;81024;63776;62240;90912 |
105 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 165.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 14.00 | 109252608 | 0.00 | 327093.33 | 6.20 | 334.01 | 7803.76 | false | 0.062441;0.062441;0.062441;0.062441;0.062441 | 109252608;109252608;109252608;109252608;109252608 | 0;0;0;0;0 | 328800;324512;332928;327968;312800 |
105 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 165.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 10.33 | 5832704 | 0.00 | 557226.67 | 9.30 | 10.47 | 564.47 | true | 0.092442;0.092604;0.092506;0.092640;0.092514 | 5832704;5832704;5832704;5832704;5832704 | 0;2048;0;0;0 | 562688;543200;556096;560640;554944 |
105 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 165.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 9.00 | 726784 | 13184.00 | 271530.67 | 1.60 | 2.55 | 80.75 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 726784;726784;726784;726784;726784 | 13184;13184;13184;13184;13184 | 271680;271552;271424;271616;271104 |
105 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 165.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 459520.00 | 284010.67 | 44.80 | 0.00 | 0.00 | true | 0.448011;0.447701;0.448365;0.448417;0.443802 | 0;0;0;0;0 | 461824;459520;459520;459520;459520 | 291168;281856;287552;281536;282624 |
106 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 26.667 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 1536.00 | 981.33 | 44.20 | 14.69 | 9.25 | true | 0.453836;0.442818;0.440408;0.441652;0.441358 | 36992;36992;36992;36992;36992 | 1184;896;896;1024;1024 | 1792;1536;1536;1536;1536 |
107 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 21.667 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 88490.67 | 43.30 | 0.42 | 9.25 | true | 0.433921;0.432682;0.433366;0.433150;0.433327 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 88576;88448;88448;88576;88448 |
108 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 128 17 17]] | 20 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 1280.00 | 49898.67 | 45.60 | 0.00 | 0.00 | true | 0.456396;0.461799;0.456207;0.456379;0.455147 | 0;0;0;0;0 | 1280;1280;1280;1280;1280 | 50304;49664;50208;48544;49824 |
109 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 128 17 17]] | 19.333 | 147968 | 0 | 96022272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 85.33 | 43.40 | 0.00 | 0.00 | true | 0.433729;0.433786;0.432952;0.433419;0.433960 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;128;0;384 |
110 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 14.00 | 109252608 | 0.00 | 375658.67 | 6.20 | 290.83 | 7803.76 | false | 0.062441;0.062441;0.062441;0.062441;0.062441 | 109252608;109252608;109252608;109252608;109252608 | 0;0;0;0;0 | 377056;371776;374560;381376;375360 |
110 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 10.67 | 5832704 | 0.00 | 577066.67 | 9.30 | 10.11 | 546.80 | true | 0.092609;0.092639;0.092650;0.092499;0.092586 | 5832704;5832704;5832704;5832704;5832704 | 586112;572096;567008;584448;574656 | 0;5120;0;0;0 |
110 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 9.33 | 774656 | 256.00 | 144810.67 | 1.60 | 5.34 | 83.00 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 774656;774656;774656;774656;774656 | 256;256;256;256;256 | 142432;156480;145504;142144;146496 |
110 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 459072.00 | 194325.33 | 44.70 | 0.00 | 0.00 | true | 0.447053;0.448682;0.447585;0.446362;0.443508 | 0;0;0;0;0 | 193696;190240;197920;191360;199840 | 459072;458816;459072;459072;459072 |
110 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 3427328 | 96170240 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 726784 | 0.00 | 284501.33 | 1.60 | 2.55 | 181.70 | true | 0.015624;0.015625;0.015625;0.015624;0.015625 | 726784;726784;726784;726784;726784 | 284736;284160;284416;284352;285216 | 128;0;0;0;0 |
111 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164 | 221952 | 5488384 | 96244224 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 21.67 | 163878912 | 0.00 | 704426.67 | 8.60 | 232.64 | 7563.53 | false | 0.086239;0.086133;0.086208;0.086300;0.086345 | 163878912;163878912;163878912;163878912;163878912 | 0;0;0;0;0 | 704256;703648;703104;705600;705376 |
111 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164 | 221952 | 5488384 | 96244224 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 8749056 | 0.00 | 1695808.00 | 12.50 | 5.16 | 749.90 | true | 0.124015;0.125849;0.124795;0.124695;0.124221 | 8749056;8749056;8749056;8749056;8749056 | 0;0;256;0;0 | 1698624;1699968;1685536;1702496;1688832 |
111 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164 | 221952 | 5488384 | 96244224 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.33 | 0 | 688128.00 | 486773.33 | 44.90 | 0.00 | 0.00 | true | 0.449059;0.452110;0.447654;0.449282;0.440558 | 0;0;0;0;0 | 485504;493952;487680;485088;487136 | 688128;688128;688128;688128;688128 |
111 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164 | 221952 | 5488384 | 96244224 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 7.00 | 774656 | 1877.33 | 269450.67 | 1.60 | 2.86 | 110.67 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 774656;774656;774656;774656;774656 | 268928;268832;272896;270592;267904 | 2048;1792;3584;1536;1792 |
111 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164 | 221952 | 5488384 | 96244224 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.67 | 1090176 | 9930.67 | 369248.00 | 2.00 | 2.88 | 192.37 | true | 0.019834;0.019827;0.019828;0.019831;0.019826 | 1090176;1090176;1090176;1090176;1090176 | 8896;10272;10208;9952;9632 | 368704;370816;368448;365600;370592 |
112 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 27.667 | 147968 | 0 | 96096256 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 36992 | 130176.00 | 4608.00 | 45.30 | 0.27 | 7.93 | true | 0.453017;0.451709;0.455788;0.453209;0.450392 | 36992;36992;36992;36992;36992 | 4608;4608;4448;4736;4608 | 130432;130048;129920;130048;130560 |
113 | InceptionV3/InceptionV3/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 23.667 | 221952 | 0 | 96096256 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1109.33 | 124586.67 | 44.30 | 0.44 | 13.87 | true | 0.450415;0.443103;0.442347;0.441878;0.444084 | 55488;55488;55488;55488;55488 | 768;6656;768;1792;768 | 124800;124576;124896;124384;122592 |
114 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 128 17 17]] | 19.667 | 147968 | 0 | 96096256 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 1280.00 | 16906.67 | 44.80 | 0.00 | 0.00 | true | 0.456285;0.447813;0.447833;0.447863;0.447622 | 0;0;0;0;0 | 1280;1280;1280;1280;1280 | 16864;16960;16864;16896;16960 |
115 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 149.667 | 147968 | 3427328 | 96244224 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 14.00 | 109252608 | 597.33 | 494346.67 | 6.20 | 220.74 | 7803.76 | false | 0.062441;0.062440;0.062441;0.062440;0.062441 | 109252608;109252608;109252608;109252608;109252608 | 0;0;0;1792;5120 | 483840;492896;493824;502080;496320 |
115 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 149.667 | 147968 | 3427328 | 96244224 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 5832704 | 256.00 | 1331893.33 | 9.30 | 4.38 | 530.25 | true | 0.092549;0.092624;0.092539;0.092548;0.092504 | 5832704;5832704;5832704;5832704;5832704 | 1338496;1345664;1326304;1320256;1330880 | 0;256;256;256;256 |
115 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 149.667 | 147968 | 3427328 | 96244224 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 7.00 | 774656 | 256.00 | 239338.67 | 1.60 | 3.23 | 110.67 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 774656;774656;774656;774656;774656 | 256;256;256;256;0 | 240480;238528;237536;251648;239008 |
115 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 149.667 | 147968 | 3427328 | 96244224 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 459178.67 | 310346.67 | 44.70 | 0.00 | 0.00 | true | 0.447414;0.449249;0.448006;0.446166;0.444894 | 0;0;0;0;0 | 459264;460288;459264;459008;459008 | 304416;307776;314752;313152;310112 |
115 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 149.667 | 147968 | 3427328 | 96244224 | GPU_0_bfc | 3279360 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 726784 | 341.33 | 283733.33 | 1.60 | 2.56 | 155.73 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 726784;726784;726784;726784;726784 | 285184;285472;284096;281920;279424 | 0;0;0;1024;1536 |
116 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 27 | 147968 | 0 | 96096256 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 768.00 | 512.00 | 44.00 | 28.90 | 9.25 | false | 0.440045;0.440066;0.439125;0.440993;0.439976 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 768;768;768;768;768 |
117 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 128 17 17]] | 20.333 | 147968 | 0 | 96096256 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 832.00 | 43.80 | 0.00 | 0.00 | true | 0.437857;0.437768;0.437852;0.438212;0.438100 | 0;0;0;0;0 | 0;0;0;0;0 | 960;832;512;832;832 |
118 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164.333 | 295936 | 5562368 | 96392192 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 21.00 | 163878912 | 0.00 | 734517.33 | 8.60 | 223.11 | 7803.76 | false | 0.086239;0.086272;0.086013;0.086146;0.085978 | 163878912;163878912;163878912;163878912;163878912 | 0;0;0;0;0 | 735264;738560;737408;726336;730880 |
118 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164.333 | 295936 | 5562368 | 96392192 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 8749056 | 0.00 | 1697376.00 | 12.50 | 5.15 | 749.90 | true | 0.124271;0.124990;0.125064;0.124337;0.124235 | 8749056;8749056;8749056;8749056;8749056 | 0;0;0;0;0 | 1702432;1712864;1690016;1696320;1693376 |
118 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164.333 | 295936 | 5562368 | 96392192 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 10.00 | 774656 | 2048.00 | 280757.33 | 1.60 | 2.74 | 77.47 | true | 0.015625;0.015625;0.015625;0.015625;0.015625 | 774656;774656;774656;774656;774656 | 1792;2048;2048;2048;2048 | 280576;279488;277280;291520;282208 |
118 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164.333 | 295936 | 5562368 | 96392192 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 688128.00 | 447477.33 | 44.90 | 0.00 | 0.00 | true | 0.449198;0.450170;0.448945;0.451509;0.444563 | 0;0;0;0;0 | 443328;443264;452064;471776;447040 | 688128;688128;688128;688128;688128 |
118 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 164.333 | 295936 | 5562368 | 96392192 | GPU_0_bfc | 5266432 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1090176 | 27818.67 | 328160.00 | 2.00 | 3.06 | 181.70 | true | 0.019856;0.019861;0.019856;0.019852;0.019854 | 1090176;1090176;1090176;1090176;1090176 | 28160;25792;27776;27520;28320 | 329152;327648;328064;328768;326784 |
119 | InceptionV3/InceptionV3/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26.333 | 295936 | 0 | 96244224 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 512.00 | 44.80 | 43.35 | 13.87 | false | 0.447520;0.446353;0.447853;0.448786;0.448150 | 55488;55488;55488;55488;55488 | 768;768;768;768;2816 | 512;512;512;512;512 |
121 | InceptionV3/InceptionV3/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 768 17 17]] | 24.333 | 1183744 | 0 | 96466176 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 512.00 | 159360.00 | 65.30 | 0.00 | 0.00 | true | 0.653823;0.654404;0.656226;0.644263;0.652013 | 0;0;0;0;0 | 159232;159584;160000;159264;158528 | 512;512;512;512;512 |
122 | InceptionV3/InceptionV3/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 768 17 17]] | 45 | 887808 | 887808 | 97353984 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 4592664 | 5376.00 | 666005.33 | 50.00 | 6.84 | 382.72 | true | 0.498286;0.499628;0.499737;0.500162;0.502735 | 4592664;4592664;4592664;4592664;4592664 | 4864;5632;6400;5376;5120 | 665824;666336;666496;665568;665856 |
123 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 131 | 185088 | 676608 | 97539072 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44.00 | 78689440 | 11605.33 | 321322.67 | 3.10 | 236.36 | 1788.40 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 78689440;78689440;78689440;78689440;78689440 | 12032;12288;11520;11264;11264 | 318720;310816;322592;325248;322656 |
123 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 131 | 185088 | 676608 | 97539072 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 492288.00 | 233429.33 | 45.50 | 0.00 | 0.00 | true | 0.455571;0.454790;0.455740;0.452091;0.454189 | 0;0;0;0;0 | 492288;497152;492288;492032;492288 | 238784;238336;231360;228896;230592 |
124 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 122.667 | 185088 | 676608 | 97724160 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.33 | 78689440 | 256.00 | 150058.67 | 3.10 | 523.50 | 1950.99 | false | 0.031247;0.031247;0.031247;0.031246;0.031247 | 78689440;78689440;78689440;78689440;78689440 | 256;256;256;256;256 | 149664;147360;154976;153152;147136 |
124 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 122.667 | 185088 | 676608 | 97724160 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 491520.00 | 39093.33 | 45.60 | 0.00 | 0.00 | true | 0.455785;0.455775;0.455708;0.456018;0.453703 | 0;0;0;0;0 | 491520;491520;491520;491520;491520 | 41376;40224;36864;36960;40096 |
125 | InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 123 | 221952 | 811776 | 97946112 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 94427328 | 0.00 | 169909.33 | 3.10 | 555.75 | 2360.68 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 169152;169664;168928;173536;170912 | 0;0;0;0;0 |
125 | InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 123 | 221952 | 811776 | 97946112 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589888.00 | 137205.33 | 45.00 | 0.00 | 0.00 | true | 0.449646;0.449067;0.451382;0.451134;0.449434 | 0;0;0;0;0 | 589888;589888;592448;589888;589888 | 138816;136832;140032;135008;135968 |
126 | InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 120.667 | 221952 | 1183744 | 96984320 | GPU_0_bfc | 961792 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 41.67 | 94427328 | 42.67 | 18730.67 | 3.10 | 5029.86 | 2266.24 | false | 0.031245;0.031244;0.031244;0.031245;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 0;0;128;0;384 | 17728;16704;21664;20064;18400 |
126 | InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 120.667 | 221952 | 1183744 | 96984320 | GPU_0_bfc | 961792 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 37728.00 | 45.10 | 0.00 | 0.00 | true | 0.450456;0.451014;0.452486;0.450753;0.448066 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 37536;40896;37632;35456;38016 |
127 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 27.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 46240 | 1024.00 | 256.00 | 45.40 | 36.12 | 10.67 | false | 0.453426;0.454241;0.454269;0.454525;0.451908 | 46240;46240;46240;46240;46240 | 128;256;256;384;256 | 1024;1024;1024;1024;1024 |
128 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 21 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 2432.00 | 437.33 | 44.10 | 16.12 | 11.56 | true | 0.439733;0.441737;0.443039;0.438986;0.441756 | 46240;46240;46240;46240;46240 | 7808;640;640;6016;640 | 416;448;448;544;288 |
129 | InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20 | 221952 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 0.00 | 44.20 | 72.25 | 13.87 | false | 0.441171;0.441189;0.441047;0.442413;0.443470 | 55488;55488;55488;55488;55488 | 0;0;0;0;0 | 768;768;768;768;768 |
130 | InceptionV3/InceptionV3/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 19.667 | 221952 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 6954.67 | 44.20 | 7.19 | 13.87 | true | 0.441937;0.440454;0.442697;0.442037;0.442245 | 55488;55488;55488;55488;55488 | 1024;768;768;768;768 | 6912;6912;7040;6272;7040 |
131 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 160 17 17]] | 20.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 128.00 | 43.80 | 0.00 | 0.00 | true | 0.438270;0.438125;0.438343;0.437826;0.437659 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;128;128;128 |
132 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 160 17 17]] | 18.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.50 | 0.00 | 0.00 | true | 0.434723;0.434446;0.434897;0.435177;0.434577 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
133 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173.333 | 185088 | 5368832 | 96281600 | GPU_0_bfc | 5183744 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 170337280 | 778.67 | 670752.00 | 6.90 | 253.66 | 6551.43 | false | 0.068542;0.068663;0.068617;0.068618;0.068632 | 170337280;170337280;170337280;170337280;170337280 | 608;1120;864;608;864 | 632000;672928;677888;686880;661440 |
133 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173.333 | 185088 | 5368832 | 96281600 | GPU_0_bfc | 5183744 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 9113600 | 1514.67 | 2509024.00 | 12.90 | 3.63 | 781.14 | true | 0.130316;0.128943;0.129019;0.129338;0.129490 | 9113600;9113600;9113600;9113600;9113600 | 6272;1344;1600;1600;1344 | 2540608;2506048;2500416;2477856;2520608 |
133 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173.333 | 185088 | 5368832 | 96281600 | GPU_0_bfc | 5183744 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.00 | 968320 | 7370.67 | 509365.33 | 1.70 | 1.87 | 96.83 | true | 0.016606;0.016605;0.016605;0.016605;0.016604 | 968320;968320;968320;968320;968320 | 15904;6944;7968;7072;7072 | 522176;501472;505344;517120;505632 |
133 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173.333 | 185088 | 5368832 | 96281600 | GPU_0_bfc | 5183744 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 716800.00 | 27317.33 | 44.40 | 0.00 | 0.00 | true | 0.440397;0.446637;0.445349;0.447216;0.438420 | 0;0;0;0;0 | 716800;716800;716800;716800;716800 | 24736;25536;30528;29536;26880 |
133 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173.333 | 185088 | 5368832 | 96281600 | GPU_0_bfc | 5183744 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 908480 | 8320.00 | 360650.67 | 1.70 | 2.46 | 129.78 | true | 0.016591;0.016592;0.016592;0.016592;0.016591 | 908480;908480;908480;908480;908480 | 7744;8320;8320;8320;8704 | 360896;360672;361312;360384;343392 |
134 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177.667 | 185088 | 5553920 | 96281600 | GPU_0_bfc | 5368832 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 170337280 | 96.00 | 648885.33 | 6.90 | 262.47 | 6551.43 | false | 0.068631;0.068663;0.068692;0.068728;0.068596 | 170337280;170337280;170337280;170337280;170337280 | 96;96;96;96;96 | 652288;646912;656640;637248;647456 |
134 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177.667 | 185088 | 5553920 | 96281600 | GPU_0_bfc | 5368832 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 968320 | 175744.00 | 397568.00 | 1.70 | 1.69 | 64.55 | true | 0.016680;0.016674;0.016661;0.016656;0.016679 | 968320;968320;968320;968320;968320 | 174976;175744;175872;175744;175744 | 400480;393568;394912;402688;397312 |
134 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177.667 | 185088 | 5553920 | 96281600 | GPU_0_bfc | 5368832 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 9113600 | 224.00 | 2503104.00 | 13.20 | 3.64 | 781.14 | true | 0.131999;0.132086;0.133176;0.131349;0.131265 | 9113600;9113600;9113600;9113600;9113600 | 224;224;224;224;224 | 2485760;2470464;2522656;2595008;2500896 |
134 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177.667 | 185088 | 5553920 | 96281600 | GPU_0_bfc | 5368832 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 717312.00 | 608213.33 | 45.20 | 0.00 | 0.00 | true | 0.452064;0.456465;0.448776;0.455017;0.445841 | 0;0;0;0;0 | 717312;717312;717312;717312;717312 | 604544;611040;613376;607872;605728 |
134 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177.667 | 185088 | 5553920 | 96281600 | GPU_0_bfc | 5368832 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.67 | 908480 | 7680.00 | 321930.67 | 1.70 | 2.76 | 136.27 | true | 0.016593;0.016595;0.016594;0.016595;0.016592 | 908480;908480;908480;908480;908480 | 7168;7680;7680;7680;7680 | 321248;323200;322176;322368;321184 |
135 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 28 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 46240 | 14464.00 | 8042.67 | 45.50 | 2.05 | 10.67 | true | 0.454100;0.455330;0.456472;0.456026;0.455125 | 46240;46240;46240;46240;46240 | 8000;8128;8128;8000;8000 | 14464;14592;14208;14720;14336 |
136 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 21 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 896.00 | 2069.33 | 45.10 | 15.59 | 11.56 | true | 0.449452;0.453897;0.449097;0.450053;0.452708 | 46240;46240;46240;46240;46240 | 896;896;896;3200;896 | 1280;1344;3840;3584;1280 |
137 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 160 17 17]] | 20.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 1066.67 | 174602.67 | 45.70 | 0.00 | 0.00 | true | 0.457176;0.457275;0.460035;0.457060;0.456763 | 0;0;0;0;0 | 1152;1024;6144;1024;1024 | 175584;175584;170336;172640;176448 |
138 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 160 17 17]] | 18.667 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 640.00 | 43.40 | 0.00 | 0.00 | true | 0.434411;0.434156;0.434711;0.434234;0.433686 | 0;0;0;0;0 | 0;0;0;0;0 | 768;640;640;640;512 |
139 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 169 | 185088 | 5739008 | 96281600 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 25.67 | 170337280 | 512.00 | 639925.33 | 6.90 | 265.97 | 6636.43 | false | 0.068641;0.068709;0.068665;0.068582;0.068724 | 170337280;170337280;170337280;170337280;170337280 | 512;512;768;512;512 | 643904;639072;623072;638432;642272 |
139 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 169 | 185088 | 5739008 | 96281600 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 9113600 | 85.33 | 2586837.33 | 13.00 | 3.52 | 759.47 | true | 0.130402;0.129309;0.130295;0.130183;0.129549 | 9113600;9113600;9113600;9113600;9113600 | 0;256;1280;0;0 | 2573408;2554688;2592192;2614720;2594912 |
139 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 169 | 185088 | 5739008 | 96281600 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 10.00 | 968320 | 832.00 | 401216.00 | 1.70 | 2.41 | 96.83 | true | 0.016646;0.016646;0.016647;0.016647;0.016644 | 968320;968320;968320;968320;968320 | 576;832;832;832;832 | 399360;399200;389376;405088;413280 |
139 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 169 | 185088 | 5739008 | 96281600 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 717568.00 | 594880.00 | 44.50 | 0.00 | 0.00 | true | 0.445181;0.444928;0.456334;0.444786;0.441879 | 0;0;0;0;0 | 717568;717312;717568;717568;717568 | 593984;593536;596576;597088;594080 |
139 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 169 | 185088 | 5739008 | 96281600 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 908480 | 512.00 | 316778.67 | 1.70 | 2.86 | 181.70 | true | 0.016655;0.016655;0.016650;0.016648;0.016652 | 908480;908480;908480;908480;908480 | 512;512;512;512;512 | 316288;316480;318752;316960;316896 |
140 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 332544 | 6656768 | 96429056 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 204404736 | 608.00 | 735626.67 | 8.60 | 277.64 | 7861.72 | false | 0.086493;0.086289;0.086506;0.086428;0.086524 | 204404736;204404736;204404736;204404736;204404736 | 733184;739776;733920;712096;743456 | 608;608;608;608;608 |
140 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 332544 | 6656768 | 96429056 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 10936320 | 672.00 | 2878997.33 | 15.50 | 3.80 | 911.36 | true | 0.154352;0.155456;0.155042;0.156576;0.155118 | 10936320;10936320;10936320;10936320;10936320 | 672;672;672;672;672 | 2856896;2875936;2885440;2896896;2875616 |
140 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 332544 | 6656768 | 96429056 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.33 | 968320 | 177536.00 | 506645.33 | 1.70 | 1.42 | 93.71 | true | 0.016641;0.016638;0.016649;0.016647;0.016636 | 968320;968320;968320;968320;968320 | 509888;505120;499296;506208;508608 | 177536;177280;177536;177536;177536 |
140 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 332544 | 6656768 | 96429056 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 860672.00 | 749056.00 | 44.90 | 0.00 | 0.00 | true | 0.448726;0.447859;0.449767;0.449750;0.444180 | 0;0;0;0;0 | 860672;861696;860672;860672;860672 | 754752;743616;751936;739552;751616 |
140 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 332544 | 6656768 | 96429056 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1090176 | 27648.00 | 342709.33 | 2.00 | 2.94 | 181.70 | true | 0.019831;0.019833;0.019832;0.019831;0.019827 | 1090176;1090176;1090176;1090176;1090176 | 27360;30944;27936;27648;27264 | 344160;341184;342624;344224;341344 |
141 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 29 | 185088 | 0 | 96243968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 29376.00 | 30538.67 | 45.80 | 0.77 | 11.56 | true | 0.458723;0.461447;0.458126;0.457990;0.457672 | 46240;46240;46240;46240;46240 | 29760;33856;27328;27072;31040 | 30304;30592;30464;31104;30560 |
142 | InceptionV3/InceptionV3/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 21.667 | 332544 | 0 | 96243968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1024.00 | 10176.00 | 45.10 | 4.95 | 13.87 | true | 0.451078;0.450694;0.451109;0.450782;0.450772 | 55488;55488;55488;55488;55488 | 1024;1024;1024;1792;1024 | 10304;9248;10080;10432;10144 |
143 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 160 17 17]] | 19.667 | 185088 | 0 | 96243968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1280.00 | 145973.33 | 45.60 | 0.00 | 0.00 | true | 0.456500;0.456539;0.455976;0.456724;0.455775 | 0;0;0;0;0 | 1280;1280;1280;1280;1280 | 146112;145792;146016;145248;146144 |
144 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5739008 | 96429056 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 25.67 | 170337280 | 512.00 | 637333.33 | 6.90 | 267.05 | 6636.43 | false | 0.068695;0.068735;0.068787;0.068607;0.068768 | 170337280;170337280;170337280;170337280;170337280 | 512;512;512;512;2560 | 638912;636832;639360;636256;604800 |
144 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5739008 | 96429056 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 9113600 | 789.33 | 2468117.33 | 13.00 | 3.69 | 781.14 | true | 0.130135;0.130882;0.130053;0.129897;0.129316 | 9113600;9113600;9113600;9113600;9113600 | 512;768;768;832;1024 | 2464416;2477824;2469088;2463200;2470848 |
144 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5739008 | 96429056 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 9.00 | 968320 | 7189.33 | 453770.67 | 1.70 | 2.10 | 107.59 | true | 0.016609;0.016614;0.016604;0.016590;0.016616 | 968320;968320;968320;968320;968320 | 9024;7232;7488;6848;6464 | 453344;456896;450944;461216;451072 |
144 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5739008 | 96429056 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 717397.33 | 593120.00 | 45.20 | 0.00 | 0.00 | true | 0.451164;0.450397;0.451715;0.455891;0.452704 | 0;0;0;0;0 | 717312;717312;717568;719360;717312 | 590336;585216;595936;593184;595840 |
144 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5739008 | 96429056 | GPU_0_bfc | 5553920 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 908480 | 128.00 | 340810.67 | 1.70 | 2.66 | 181.70 | true | 0.016637;0.016637;0.016641;0.016638;0.016646 | 908480;908480;908480;908480;908480 | 128;128;128;64;768 | 340544;341024;340864;341376;338944 |
145 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 27.667 | 185088 | 0 | 96243968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 1088.00 | 1088.00 | 45.30 | 21.25 | 11.56 | false | 0.453751;0.454203;0.453126;0.451771;0.453069 | 46240;46240;46240;46240;46240 | 1088;1088;1088;1088;1088 | 1088;1088;1088;1088;1088 |
146 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 160 17 17]] | 20 | 185088 | 0 | 96243968 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 426.67 | 384.00 | 43.90 | 0.00 | 0.00 | true | 0.438625;0.438913;0.438919;0.438971;0.438888 | 0;0;0;0;0 | 384;512;512;384;384 | 512;384;384;384;384 |
147 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 177.333 | 370176 | 6694400 | 96614144 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 204404736 | 608.00 | 741109.33 | 8.60 | 275.58 | 7861.72 | false | 0.086454;0.086480;0.086557;0.086401;0.086313 | 204404736;204404736;204404736;204404736;204404736 | 608;608;608;608;608 | 742272;744192;736448;741632;739424 |
147 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 177.333 | 370176 | 6694400 | 96614144 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 968320 | 18304.00 | 439498.67 | 1.70 | 2.12 | 69.17 | true | 0.016909;0.016817;0.016841;0.016836;0.016852 | 968320;968320;968320;968320;968320 | 18048;17664;27008;17536;19200 | 438432;447776;451872;432288;422208 |
147 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 177.333 | 370176 | 6694400 | 96614144 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 10936320 | 512.00 | 2889418.67 | 15.40 | 3.78 | 911.36 | true | 0.153945;0.154788;0.153461;0.152875;0.153296 | 10936320;10936320;10936320;10936320;10936320 | 512;576;512;512;512 | 2887360;2881856;2893120;2887776;2900480 |
147 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 177.333 | 370176 | 6694400 | 96614144 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 860672.00 | 763477.33 | 44.90 | 0.00 | 0.00 | true | 0.456908;0.453657;0.446030;0.446126;0.442869 | 0;0;0;0;0 | 860672;860672;860672;860672;860672 | 760960;752544;760096;775296;769376 |
147 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 177.333 | 370176 | 6694400 | 96614144 | GPU_0_bfc | 6324224 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.67 | 1090176 | 33344.00 | 353194.67 | 2.00 | 2.82 | 192.37 | true | 0.019857;0.019857;0.019856;0.019852;0.019851 | 1090176;1090176;1090176;1090176;1090176 | 31392;31648;37184;36992;30848 | 350848;357056;353632;349440;355104 |
148 | InceptionV3/InceptionV3/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 27.333 | 370176 | 0 | 96429056 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1088.00 | 640.00 | 45.60 | 32.11 | 13.87 | false | 0.457645;0.456647;0.456478;0.455620;0.454833 | 55488;55488;55488;55488;55488 | 1088;1088;1088;1088;1088 | 640;640;1024;640;640 |
150 | InceptionV3/InceptionV3/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 768 17 17]] | 24 | 1331968 | 0 | 96614400 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 512.00 | 326410.67 | 65.70 | 0.00 | 0.00 | true | 0.656334;0.657914;0.656977;0.659605;0.656298 | 0;0;0;0;0 | 325984;327488;326272;326912;326048 | 512;512;512;512;512 |
151 | InceptionV3/InceptionV3/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 768 17 17]] | 44 | 1109760 | 1109760 | 97724160 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 4453239 | 768.00 | 667200.00 | 50.30 | 6.67 | 371.10 | true | 0.501912;0.506712;0.503576;0.501562;0.503259 | 4453239;4453239;4453239;4453239;4453239 | 256;768;768;768;768 | 666912;666976;667264;667488;667360 |
152 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 129 | 185088 | 676608 | 97909248 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 43.33 | 78689440 | 7936.00 | 349866.67 | 3.10 | 219.92 | 1815.92 | false | 0.031248;0.031249;0.031249;0.031248;0.031248 | 78689440;78689440;78689440;78689440;78689440 | 6912;7936;7936;7936;7936 | 354528;345344;356224;344384;349728 |
152 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 129 | 185088 | 676608 | 97909248 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 492288.00 | 214997.33 | 45.70 | 0.00 | 0.00 | true | 0.457746;0.455559;0.456886;0.457310;0.452552 | 0;0;0;0;0 | 215456;216320;212832;213376;216160 | 492544;492288;492288;492288;492288 |
153 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 123 | 185088 | 676608 | 98094336 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 78689440 | 0.00 | 120192.00 | 3.10 | 654.70 | 1967.24 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 78689440;78689440;78689440;78689440;78689440 | 0;0;0;0;0 | 122240;118016;121760;119072;119744 |
153 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 123 | 185088 | 676608 | 98094336 | GPU_0_bfc | 491520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 491520.00 | 18144.00 | 45.40 | 0.00 | 0.00 | true | 0.453654;0.455167;0.454751;0.455062;0.449502 | 0;0;0;0;0 | 491520;491520;491520;491520;491520 | 18496;17504;18432;16384;18688 |
154 | InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 125 | 221952 | 811776 | 98316288 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.33 | 94427328 | 0.00 | 116970.67 | 3.10 | 807.27 | 2341.19 | false | 0.031244;0.031244;0.031245;0.031244;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 0;0;0;0;0 | 118464;127872;111424;120960;111488 |
154 | InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 125 | 221952 | 811776 | 98316288 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 148341.33 | 44.80 | 0.00 | 0.00 | true | 0.448412;0.446882;0.449405;0.447741;0.445327 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 151680;139616;152704;140640;155840 |
155 | InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 121.333 | 221952 | 1331968 | 97206272 | GPU_0_bfc | 1110016 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 94427328 | 1408.00 | 13781.33 | 3.10 | 6216.69 | 2360.68 | false | 0.031244;0.031244;0.031244;0.031245;0.031244 | 94427328;94427328;94427328;94427328;94427328 | 1792;1408;1408;1408;1024 | 11488;13728;14016;13600;14112 |
155 | InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 121.333 | 221952 | 1331968 | 97206272 | GPU_0_bfc | 1110016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 591616.00 | 64736.00 | 45.30 | 0.00 | 0.00 | true | 0.452614;0.453739;0.452924;0.452335;0.451592 | 0;0;0;0;0 | 65440;64992;64928;63648;64288 | 596480;589824;589824;595200;589824 |
156 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 27.667 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 1152.00 | 405.33 | 45.40 | 29.69 | 11.56 | false | 0.453603;0.454300;0.452820;0.454982;0.452248 | 46240;46240;46240;46240;46240 | 2432;1152;1152;1152;1152 | 320;320;320;576;576 |
157 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 21.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 640.00 | 277.33 | 44.00 | 50.41 | 11.56 | false | 0.439726;0.439292;0.440321;0.440763;0.441942 | 46240;46240;46240;46240;46240 | 640;640;640;640;640 | 864;832;0;0;0 |
158 | InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.333 | 221952 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 24640.00 | 44.30 | 2.18 | 13.87 | true | 0.442529;0.440102;0.442274;0.443389;0.443055 | 55488;55488;55488;55488;55488 | 24480;24544;24480;24896;24928 | 768;768;768;768;768 |
159 | InceptionV3/InceptionV3/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.333 | 221952 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 117.33 | 44.20 | 62.67 | 13.87 | false | 0.443868;0.440918;0.442421;0.441766;0.442997 | 55488;55488;55488;55488;55488 | 768;768;768;768;768 | 288;160;32;32;160 |
160 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 160 17 17]] | 20 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 21.33 | 43.80 | 0.00 | 0.00 | true | 0.437943;0.438679;0.438158;0.438598;0.437755 | 0;0;0;0;0 | 4096;0;0;0;0 | 0;0;64;0;128 |
161 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 160 17 17]] | 19.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 1621.33 | 43.40 | 0.00 | 0.00 | true | 0.433495;0.434067;0.434311;0.433898;0.433900 | 0;0;0;0;0 | 768;0;0;0;0 | 3360;3392;256;256;1248 |
162 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173 | 185088 | 5331712 | 96281600 | GPU_0_bfc | 5146624 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 170337280 | 608.00 | 725066.67 | 6.90 | 234.73 | 6551.43 | false | 0.068636;0.068604;0.068564;0.068673;0.068646 | 170337280;170337280;170337280;170337280;170337280 | 608;608;608;608;608 | 728832;722080;707584;724288;733696 |
162 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173 | 185088 | 5331712 | 96281600 | GPU_0_bfc | 5146624 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 9113600 | 1429.33 | 2546112.00 | 13.00 | 3.58 | 781.14 | true | 0.129980;0.130027;0.129784;0.129979;0.130387 | 9113600;9113600;9113600;9113600;9113600 | 2550432;2545920;2569024;2539648;2541984 | 1344;1344;1344;1664;1600 |
162 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173 | 185088 | 5331712 | 96281600 | GPU_0_bfc | 5146624 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 9.67 | 968320 | 5365.33 | 504405.33 | 1.70 | 1.90 | 100.17 | true | 0.016606;0.016606;0.016607;0.016606;0.016605 | 968320;968320;968320;968320;968320 | 5408;5408;5536;5280;5280 | 500544;508704;499776;512480;503968 |
162 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173 | 185088 | 5331712 | 96281600 | GPU_0_bfc | 5146624 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 716949.33 | 205834.67 | 44.60 | 0.00 | 0.00 | true | 0.447184;0.445857;0.445025;0.446731;0.439047 | 0;0;0;0;0 | 716864;718912;716864;716864;717120 | 203296;206432;211264;207680;203392 |
162 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 173 | 185088 | 5331712 | 96281600 | GPU_0_bfc | 5146624 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.67 | 908480 | 7914.67 | 339189.33 | 1.70 | 2.62 | 136.27 | true | 0.016592;0.016591;0.016591;0.016591;0.016591 | 908480;908480;908480;908480;908480 | 7488;8000;8000;8000;7744 | 337472;339232;340800;339136;339200 |
163 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177 | 185088 | 5146624 | 96281600 | GPU_0_bfc | 4961536 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 25.33 | 170337280 | 512.00 | 695317.33 | 6.90 | 244.80 | 6723.93 | false | 0.068726;0.068668;0.068696;0.068667;0.068638 | 170337280;170337280;170337280;170337280;170337280 | 512;512;512;2560;512 | 697920;699552;695104;688544;692928 |
163 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177 | 185088 | 5146624 | 96281600 | GPU_0_bfc | 4961536 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 968320 | 190954.67 | 362133.33 | 1.70 | 1.75 | 64.55 | true | 0.016621;0.016610;0.016624;0.016607;0.016610 | 968320;968320;968320;968320;968320 | 367328;361632;353664;357952;366816 | 190208;191360;190464;191296;191104 |
163 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177 | 185088 | 5146624 | 96281600 | GPU_0_bfc | 4961536 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 9113600 | 512.00 | 2540512.00 | 12.90 | 3.59 | 759.47 | true | 0.133047;0.128469;0.129692;0.128777;0.128923 | 9113600;9113600;9113600;9113600;9113600 | 512;512;512;512;512 | 2570720;2572864;2536704;2479328;2514112 |
163 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177 | 185088 | 5146624 | 96281600 | GPU_0_bfc | 4961536 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 717504.00 | 612650.67 | 44.40 | 0.00 | 0.00 | true | 0.446966;0.443156;0.446353;0.441680;0.439897 | 0;0;0;0;0 | 717504;717504;717504;717504;717504 | 613056;610400;609696;618016;614496 |
163 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 177 | 185088 | 5146624 | 96281600 | GPU_0_bfc | 4961536 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 908480 | 7552.00 | 312373.33 | 1.70 | 2.84 | 151.41 | true | 0.016595;0.016594;0.016595;0.016594;0.016593 | 908480;908480;908480;908480;908480 | 7040;7296;9728;8064;7296 | 314688;311936;307072;310496;314848 |
164 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 27.333 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 46240 | 53610.67 | 8085.33 | 45.90 | 0.75 | 9.91 | true | 0.458593;0.458471;0.459292;0.458546;0.456028 | 46240;46240;46240;46240;46240 | 8000;8384;8256;8000;8000 | 54208;53312;54592;53312;51648 |
165 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 20.667 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 896.00 | 1962.67 | 45.00 | 16.18 | 11.56 | true | 0.451052;0.449189;0.452665;0.449466;0.447607 | 46240;46240;46240;46240;46240 | 896;896;896;896;896 | 2048;1920;1920;2048;1920 |
166 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 160 17 17]] | 20.667 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 1280.00 | 175733.33 | 45.70 | 0.00 | 0.00 | true | 0.457196;0.457113;0.457071;0.457898;0.456196 | 0;0;0;0;0 | 175808;175072;175808;176064;175584 | 1280;1280;6144;1280;1280 |
167 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 160 17 17]] | 19 | 185088 | 0 | 96096512 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 597.33 | 43.40 | 0.00 | 0.00 | true | 0.434173;0.434582;0.434069;0.433986;0.434106 | 0;0;0;0;0 | 0;0;0;0;0 | 640;640;512;512;640 |
168 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5123840 | 96281600 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 170337280 | 608.00 | 675157.33 | 6.90 | 252.07 | 6551.43 | false | 0.068643;0.068680;0.068719;0.068637;0.068589 | 170337280;170337280;170337280;170337280;170337280 | 674624;682176;681856;662304;668992 | 608;608;608;608;608 |
168 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5123840 | 96281600 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 9113600 | 0.00 | 2579562.67 | 13.00 | 3.53 | 781.14 | true | 0.129224;0.130054;0.130210;0.129895;0.130534 | 9113600;9113600;9113600;9113600;9113600 | 0;0;0;0;0 | 2541824;2612032;2582400;2608096;2548192 |
168 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5123840 | 96281600 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 9.33 | 968320 | 256.00 | 400490.67 | 1.70 | 2.42 | 103.75 | true | 0.016646;0.016647;0.016647;0.016647;0.016647 | 968320;968320;968320;968320;968320 | 388576;402016;401696;420288;397760 | 0;1280;256;256;256 |
168 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5123840 | 96281600 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 717312.00 | 605568.00 | 44.50 | 0.00 | 0.00 | true | 0.445064;0.444786;0.450679;0.443826;0.443471 | 0;0;0;0;0 | 717312;717312;717568;717312;717312 | 609152;608256;604096;598528;604352 |
168 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 168.333 | 185088 | 5123840 | 96281600 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 908480 | 554.67 | 303402.67 | 1.70 | 2.99 | 194.66 | true | 0.016645;0.016649;0.016648;0.016648;0.016649 | 908480;908480;908480;908480;908480 | 512;640;512;640;512 | 303104;303808;303200;305632;303200 |
169 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 171.667 | 221952 | 6694400 | 96318464 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 204404736 | 672.00 | 810816.00 | 8.60 | 251.89 | 7861.72 | false | 0.086432;0.086345;0.086595;0.086384;0.086601 | 204404736;204404736;204404736;204404736;204404736 | 672;672;672;672;672 | 832832;811968;816000;799712;804480 |
169 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 171.667 | 221952 | 6694400 | 96318464 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 10936320 | 512.00 | 2913397.33 | 15.40 | 3.75 | 911.36 | true | 0.155205;0.153271;0.153937;0.153444;0.153357 | 10936320;10936320;10936320;10936320;10936320 | 512;512;512;512;512 | 2904576;2906304;2912768;2921120;2948832 |
169 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 171.667 | 221952 | 6694400 | 96318464 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.33 | 968320 | 186496.00 | 407754.67 | 1.70 | 1.63 | 93.71 | true | 0.016605;0.016605;0.016605;0.016611;0.016605 | 968320;968320;968320;968320;968320 | 403744;426592;407232;410208;405824 | 186496;186496;186496;186496;186496 |
169 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 171.667 | 221952 | 6694400 | 96318464 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 860416.00 | 753354.67 | 44.50 | 0.00 | 0.00 | true | 0.446096;0.444102;0.445822;0.446933;0.442120 | 0;0;0;0;0 | 745216;749504;767520;750112;760448 | 860416;860416;860416;860416;860416 |
169 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 171.667 | 221952 | 6694400 | 96318464 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.33 | 1090176 | 25408.00 | 349952.00 | 2.00 | 2.90 | 204.42 | true | 0.019831;0.019833;0.019831;0.019833;0.019830 | 1090176;1090176;1090176;1090176;1090176 | 25632;25824;24768;24864;25728 | 349440;349632;350080;350144;350528 |
170 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 27.333 | 185088 | 0 | 96133376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 46240 | 140224.00 | 30677.33 | 45.70 | 0.27 | 9.91 | true | 0.455321;0.458093;0.458229;0.455899;0.455610 | 46240;46240;46240;46240;46240 | 30720;30528;30592;30720;30720 | 141504;140608;140736;138688;139328 |
171 | InceptionV3/InceptionV3/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 21.667 | 221952 | 0 | 96133376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1024.00 | 8458.67 | 45.10 | 5.85 | 13.87 | true | 0.451127;0.449665;0.450121;0.459666;0.450257 | 55488;55488;55488;55488;55488 | 1024;1024;1024;1024;1024 | 9152;8640;8352;8384;8352 |
172 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 160 17 17]] | 20 | 185088 | 0 | 96133376 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 1280.00 | 146602.67 | 45.60 | 0.00 | 0.00 | true | 0.456239;0.456366;0.456332;0.456292;0.455324 | 0;0;0;0;0 | 1280;1280;1280;1280;2560 | 145504;146976;147072;146560;146272 |
173 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 164.667 | 332544 | 5271296 | 96465920 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 26.00 | 170337280 | 512.00 | 683264.00 | 6.90 | 249.11 | 6551.43 | false | 0.068723;0.068726;0.068726;0.068759;0.068793 | 170337280;170337280;170337280;170337280;170337280 | 512;512;512;512;512 | 682944;682720;684128;689216;675968 |
173 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 164.667 | 332544 | 5271296 | 96465920 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 9113600 | 768.00 | 2500469.33 | 13.00 | 3.64 | 828.51 | true | 0.130229;0.129686;0.129862;0.129965;0.130446 | 9113600;9113600;9113600;9113600;9113600 | 512;768;768;768;768 | 2520000;2495424;2495488;2501664;2504256 |
173 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 164.667 | 332544 | 5271296 | 96465920 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 717312.00 | 595989.33 | 44.80 | 0.00 | 0.00 | true | 0.463020;0.454895;0.442983;0.446503;0.443042 | 0;0;0;0;0 | 717312;717312;717312;717312;717312 | 588576;599968;591456;601056;596544 |
173 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 164.667 | 332544 | 5271296 | 96465920 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 7.00 | 968320 | 1301.33 | 434538.67 | 1.70 | 2.22 | 138.33 | true | 0.016652;0.016640;0.016634;0.016642;0.016643 | 968320;968320;968320;968320;968320 | 1216;1216;1472;1216;6336 | 427072;434016;436064;433536;443584 |
173 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 160 17 17]] | 164.667 | 332544 | 5271296 | 96465920 | GPU_0_bfc | 4938752 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 908480 | 1194.67 | 328693.33 | 1.70 | 2.75 | 181.70 | true | 0.016632;0.016631;0.016635;0.016634;0.016630 | 908480;908480;908480;908480;908480 | 326272;327648;330624;328544;329888 | 3072;384;640;768;2176 |
174 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 160 17 17]] | 27.333 | 332544 | 0 | 96280832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 46240 | 896.00 | 896.00 | 44.50 | 25.80 | 11.56 | false | 0.445467;0.444644;0.446136;0.444564;0.444347 | 46240;46240;46240;46240;46240 | 896;896;896;896;896 | 896;896;896;896;896 |
175 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 160 17 17]] | 20 | 332544 | 0 | 96280832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 608.00 | 768.00 | 44.90 | 0.00 | 0.00 | true | 0.448578;0.449163;0.449205;0.448770;0.455576 | 0;0;0;0;0 | 896;768;768;736;768 | 608;608;608;608;608 |
176 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 221952 | 6694400 | 96502784 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 25.67 | 204404736 | 672.00 | 756021.33 | 8.60 | 270.13 | 7963.72 | false | 0.086423;0.086344;0.086590;0.086442;0.086416 | 204404736;204404736;204404736;204404736;204404736 | 672;672;5792;672;672 | 749856;755296;760384;752384;770976 |
176 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 221952 | 6694400 | 96502784 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 968320 | 140874.67 | 432373.33 | 1.70 | 1.69 | 64.55 | true | 0.016709;0.016712;0.016704;0.016698;0.016690 | 968320;968320;968320;968320;968320 | 141216;139808;143648;141344;140064 | 431264;434816;422560;431040;440448 |
176 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 221952 | 6694400 | 96502784 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 10936320 | 512.00 | 2831786.67 | 15.40 | 3.86 | 911.36 | true | 0.154540;0.154201;0.153311;0.152986;0.153157 | 10936320;10936320;10936320;10936320;10936320 | 512;512;768;512;512 | 2828256;2837088;2830016;2845888;2814272 |
176 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 221952 | 6694400 | 96502784 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 860416.00 | 744682.67 | 44.80 | 0.00 | 0.00 | true | 0.449975;0.449486;0.447810;0.447357;0.441866 | 0;0;0;0;0 | 860416;860416;860416;860416;860416 | 754208;743008;746496;744544;742176 |
176 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 175.667 | 221952 | 6694400 | 96502784 | GPU_0_bfc | 6472448 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 44416.00 | 347104.00 | 2.00 | 2.78 | 218.04 | true | 0.019854;0.019855;0.019853;0.019855;0.019855 | 1090176;1090176;1090176;1090176;1090176 | 346368;347840;343392;347104;349024 | 43744;44736;45600;42336;44768 |
177 | InceptionV3/InceptionV3/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 28.333 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 1088.00 | 1216.00 | 45.60 | 24.08 | 11.89 | false | 0.455846;0.457036;0.455978;0.456365;0.455812 | 55488;55488;55488;55488;55488 | 1216;1216;1088;1216;1216 | 1088;6208;1088;1088;1088 |
179 | InceptionV3/InceptionV3/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 768 17 17]] | 24.667 | 1480192 | 0 | 96762624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 512.00 | 287392.00 | 65.50 | 0.00 | 0.00 | true | 0.655413;0.655781;0.653849;0.655357;0.654096 | 0;0;0;0;0 | 512;512;2816;512;512 | 288544;288064;283360;287456;286656 |
180 | InceptionV3/InceptionV3/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 768 17 17]] | 44.667 | 1331712 | 1331712 | 98094336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12.00 | 4548759 | 1280.00 | 665589.33 | 50.20 | 6.82 | 379.06 | true | 0.502912;0.502260;0.501512;0.500757;0.501686 | 4548759;4548759;4548759;4548759;4548759 | 1024;1280;1280;1280;1280 | 665632;664576;665568;665888;665568 |
181 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 130.333 | 221952 | 811776 | 98316288 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 43.33 | 94427328 | 7936.00 | 379744.00 | 3.10 | 243.57 | 2179.11 | false | 0.031249;0.031249;0.031249;0.031248;0.031249 | 94427328;94427328;94427328;94427328;94427328 | 6912;7936;7936;7936;7936 | 382720;374336;386304;382176;372832 |
181 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 130.333 | 221952 | 811776 | 98316288 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 590762.67 | 295274.67 | 45.20 | 0.00 | 0.00 | true | 0.452878;0.453069;0.453099;0.450782;0.449931 | 0;0;0;0;0 | 295328;299360;289856;291136;313792 | 591232;590592;590848;590848;590592 |
182 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 128.333 | 221952 | 811776 | 98538240 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 94427328 | 0.00 | 126112.00 | 3.10 | 748.76 | 2360.68 | false | 0.031245;0.031245;0.031245;0.031245;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 128512;124448;124768;125056;130432 | 0;0;0;1024;0 |
182 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 128.333 | 221952 | 811776 | 98538240 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 100437.33 | 45.10 | 0.00 | 0.00 | true | 0.451621;0.450926;0.451678;0.450301;0.452451 | 0;0;0;0;0 | 589824;589824;589824;589824;589824 | 98368;102016;101536;101408;92448 |
183 | InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 127 | 221952 | 811776 | 98760192 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 94427328 | 0.00 | 128501.33 | 3.10 | 734.84 | 2360.68 | false | 0.031246;0.031246;0.031245;0.031245;0.031245 | 94427328;94427328;94427328;94427328;94427328 | 0;0;0;0;0 | 127456;122528;128224;131264;129824 |
183 | InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 127 | 221952 | 811776 | 98760192 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 114474.67 | 45.00 | 0.00 | 0.00 | true | 0.451789;0.450218;0.452167;0.448322;0.447988 | 0;0;0;0;0 | 115296;118368;115712;103072;112416 | 589824;592896;589824;589824;589824 |
184 | InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 125 | 221952 | 811776 | 97501952 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 94427328 | 109482.67 | 29877.33 | 3.10 | 677.58 | 2098.39 | false | 0.031244;0.031245;0.031244;0.031245;0.031244 | 94427328;94427328;94427328;94427328;94427328 | 109440;110720;108416;109952;109056 | 27552;29280;30080;30560;30272 |
184 | InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 125 | 221952 | 811776 | 97501952 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 589824.00 | 33696.00 | 45.00 | 0.00 | 0.00 | true | 0.449788;0.451442;0.451336;0.450045;0.446659 | 0;0;0;0;0 | 34336;34400;32736;31424;34016 | 589824;590848;589824;589824;589824 |
185 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 27.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1280.00 | 384.00 | 45.60 | 33.35 | 13.87 | false | 0.455926;0.455365;0.455595;0.455008;0.458862 | 55488;55488;55488;55488;55488 | 1280;1280;1280;1280;1280 | 384;384;256;384;384 |
186 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 21.333 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 554.67 | 44.40 | 41.95 | 13.87 | false | 0.443425;0.445173;0.442700;0.443169;0.445398 | 55488;55488;55488;55488;55488 | 1664;0;0;0;5376 | 768;768;768;768;768 |
187 | InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 21.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 853.33 | 458.67 | 44.20 | 42.29 | 13.87 | false | 0.441185;0.441597;0.441985;0.441898;0.443657 | 55488;55488;55488;55488;55488 | 1024;768;768;768;10496 | 384;1952;416;288;576 |
188 | InceptionV3/InceptionV3/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 0.00 | 44.10 | 72.25 | 13.87 | false | 0.439862;0.441371;0.441919;0.440952;0.441957 | 55488;55488;55488;55488;55488 | 0;0;0;0;0 | 768;768;768;768;768 |
189 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 21.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 128.00 | 44.00 | 0.00 | 0.00 | true | 0.440770;0.439381;0.439694;0.439617;0.439461 | 0;0;0;0;0 | 0;0;0;0;0 | 128;128;128;128;5120 |
190 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 19 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 128.00 | 43.50 | 0.00 | 0.00 | true | 0.435276;0.434923;0.435134;0.434945;0.434486 | 0;0;0;0;0 | 0;0;0;0;0 | 128;128;128;128;128 |
191 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 85088.00 | 1152928.00 | 8.70 | 197.84 | 8164.35 | false | 0.086774;0.086386;0.086557;0.086736;0.086672 | 244930560;244930560;244930560;244930560;244930560 | 1151552;1140416;1160160;1179392;1147072 | 85984;88416;83040;80224;86240 |
191 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2496.00 | 4108032.00 | 16.30 | 3.19 | 937.40 | true | 0.160542;0.158061;0.161954;0.167487;0.167561 | 13123584;13123584;13123584;13123584;13123584 | 4117888;4108992;4095424;4097216;4127712 | 4416;2240;2496;2752;2240 |
191 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 13.00 | 1161984 | 227445.33 | 697258.67 | 2.00 | 1.26 | 89.38 | true | 0.019914;0.019916;0.019915;0.019914;0.019914 | 1161984;1161984;1161984;1161984;1161984 | 227360;227488;227488;227360;227488 | 699488;696032;699424;696320;684160 |
191 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 507157.33 | 44.10 | 0.00 | 0.00 | true | 0.444126;0.444191;0.443239;0.435569;0.435117 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 503200;511264;509152;509120;493600 |
191 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1090176 | 13760.00 | 312469.33 | 2.00 | 3.34 | 155.74 | true | 0.019866;0.019866;0.019868;0.019865;0.019865 | 1090176;1090176;1090176;1090176;1090176 | 13248;13760;13760;13760;13760 | 315968;313280;313568;308000;310560 |
192 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.33 | 244930560 | 204640.00 | 1127626.67 | 8.60 | 183.84 | 8074.72 | false | 0.086659;0.086521;0.086487;0.086448;0.086353 | 244930560;244930560;244930560;244930560;244930560 | 201952;177760;216416;222816;195552 | 1135712;1121664;1109440;1125504;1141344 |
192 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 1161984 | 229792.00 | 565109.33 | 2.00 | 1.46 | 74.17 | true | 0.019944;0.019938;0.019941;0.019944;0.019961 | 1161984;1161984;1161984;1161984;1161984 | 571136;562688;569120;553600;563520 | 229024;229792;229792;229792;229792 |
192 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.67 | 13123584 | 1226.67 | 4435349.33 | 16.10 | 2.96 | 894.77 | true | 0.156722;0.157940;0.159218;0.165909;0.164502 | 13123584;13123584;13123584;13123584;13123584 | 4413280;4436544;4431776;4457312;4437728 | 1312;736;1312;1184;1184 |
192 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032896.00 | 811968.00 | 44.40 | 0.00 | 0.00 | true | 0.454657;0.445915;0.446288;0.440212;0.439858 | 0;0;0;0;0 | 1032896;1032896;1034176;1032896;1032896 | 812768;808672;812096;811168;812640 |
192 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.33 | 1090176 | 13354.67 | 259893.33 | 2.00 | 3.99 | 172.14 | true | 0.019876;0.019873;0.019876;0.019873;0.019874 | 1090176;1090176;1090176;1090176;1090176 | 264128;267840;250752;257088;258464 | 12288;12224;14720;13824;13952 |
193 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 30.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 218602.67 | 68266.67 | 45.90 | 0.19 | 11.89 | true | 0.459655;0.458687;0.459460;0.459133;0.460107 | 55488;55488;55488;55488;55488 | 218176;218176;225600;219072;218560 | 65408;77696;79872;61696;60672 |
194 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 24.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1088.00 | 51626.67 | 45.20 | 1.05 | 13.87 | true | 0.451576;0.453947;0.451735;0.452016;0.451406 | 55488;55488;55488;55488;55488 | 55552;45440;44032;53888;55936 | 1088;1088;1088;1088;1088 |
195 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 21 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 1280.00 | 59776.00 | 45.70 | 0.00 | 0.00 | true | 0.456859;0.459447;0.456883;0.456010;0.456301 | 0;0;0;0;0 | 1280;1280;1280;1280;1280 | 60160;59520;53376;61824;59648 |
196 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 192 17 17]] | 19 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 341.33 | 43.60 | 0.00 | 0.00 | true | 0.435448;0.435016;0.436284;0.435676;0.435919 | 0;0;0;0;0 | 0;0;0;6656;0 | 384;256;384;384;256 |
197 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.33 | 244930560 | 28128.00 | 1107968.00 | 8.70 | 215.59 | 8074.72 | false | 0.086584;0.086568;0.086567;0.086792;0.086681 | 244930560;244930560;244930560;244930560;244930560 | 1100192;1111392;1115008;1107616;1104896 | 30560;35040;24672;29152;24544 |
197 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222837.33 | 598005.33 | 2.00 | 1.42 | 77.47 | true | 0.019963;0.019941;0.019951;0.019971;0.019971 | 1161984;1161984;1161984;1161984;1161984 | 222624;222752;222880;222880;222880 | 586656;601952;622976;594688;597376 |
197 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 1578.67 | 4241600.00 | 16.20 | 3.09 | 937.40 | true | 0.160216;0.159258;0.159301;0.165577;0.165539 | 13123584;13123584;13123584;13123584;13123584 | 8192;1408;1152;1408;1920 | 4248960;4229568;4215072;4251744;4246272 |
197 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032960.00 | 861941.33 | 44.80 | 0.00 | 0.00 | true | 0.451641;0.446366;0.448456;0.447901;0.440885 | 0;0;0;0;0 | 1032960;1032960;1032960;1032960;1032960 | 866432;859872;877600;851456;859520 |
197 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1090176 | 1173.33 | 270368.00 | 2.00 | 4.01 | 233.59 | true | 0.019929;0.019933;0.019950;0.019961;0.019883 | 1090176;1090176;1090176;1090176;1090176 | 1088;1408;1344;896;1088 | 269152;265952;270624;274240;271328 |
198 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 244930560 | 388277.33 | 1126528.00 | 8.70 | 161.69 | 7900.99 | false | 0.086763;0.086886;0.086845;0.086862;0.086510 | 244930560;244930560;244930560;244930560;244930560 | 392288;360416;368352;433376;404192 | 1135136;1108768;1104192;1135680;1153504 |
198 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.67 | 13123584 | 1162.67 | 4473376.00 | 15.90 | 2.93 | 894.77 | true | 0.158508;0.158653;0.157490;0.161187;0.162815 | 13123584;13123584;13123584;13123584;13123584 | 1120;1120;1248;1632;1120 | 4467616;4431200;4460256;4514784;4492256 |
198 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.33 | 1161984 | 223818.67 | 573994.67 | 2.00 | 1.46 | 112.45 | true | 0.019913;0.019913;0.019913;0.019913;0.019912 | 1161984;1161984;1161984;1161984;1161984 | 223648;223904;223904;223904;223648 | 567936;576448;580512;558240;577600 |
198 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032896.00 | 937077.33 | 44.70 | 0.00 | 0.00 | true | 0.448273;0.447816;0.446027;0.444772;0.446060 | 0;0;0;0;0 | 1032896;1032896;1032896;1032896;1032896 | 950112;924352;939712;936128;935392 |
198 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.667 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1090176 | 14037.33 | 287317.33 | 2.00 | 3.62 | 181.70 | true | 0.019834;0.019830;0.019833;0.019827;0.019830 | 1090176;1090176;1090176;1090176;1090176 | 285056;300384;292448;284448;266528 | 14336;13952;12864;13824;16640 |
199 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 28.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 148416.00 | 33653.33 | 45.90 | 0.30 | 11.89 | true | 0.460006;0.459280;0.460272;0.458413;0.458633 | 55488;55488;55488;55488;55488 | 38560;76960;38816;23456;23584 | 148416;149312;147392;149312;147520 |
200 | InceptionV3/InceptionV3/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 1088.00 | 16000.00 | 45.20 | 3.25 | 12.81 | true | 0.451601;0.451742;0.459517;0.450843;0.451585 | 55488;55488;55488;55488;55488 | 15232;15360;17408;16896;15744 | 1088;1088;1088;1088;1088 |
201 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 1280.00 | 69952.00 | 45.70 | 0.00 | 0.00 | true | 0.457317;0.457273;0.457413;0.456972;0.456911 | 0;0;0;0;0 | 1280;1280;1280;1280;1280 | 72192;45952;69664;69760;70432 |
202 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 178 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.33 | 244930560 | 63882.67 | 1173376.00 | 8.70 | 197.96 | 8074.72 | false | 0.086781;0.086722;0.086881;0.086672;0.086646 | 244930560;244930560;244930560;244930560;244930560 | 1177728;1166624;1142144;1192224;1175776 | 62688;66144;65760;52832;63200 |
202 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 178 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 1365.33 | 4220458.67 | 16.20 | 3.11 | 937.40 | true | 0.160159;0.158726;0.159736;0.166300;0.165884 | 13123584;13123584;13123584;13123584;13123584 | 1152;1408;1408;1408;1280 | 4214304;4177312;4240032;4207040;4247904 |
202 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 178 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 221813.33 | 687061.33 | 2.00 | 1.28 | 105.63 | true | 0.020011;0.019969;0.019998;0.019997;0.019995 | 1161984;1161984;1161984;1161984;1161984 | 221472;220064;223520;222368;221600 | 681696;704032;701632;677856;674400 |
202 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 178 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032704.00 | 894901.33 | 44.60 | 0.00 | 0.00 | true | 0.450182;0.450872;0.446773;0.441487;0.439252 | 0;0;0;0;0 | 896064;904704;897952;890688;886720 | 1032704;1032704;1032704;1032704;1032704 |
202 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 178 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 13056.00 | 229909.33 | 2.00 | 4.49 | 218.04 | true | 0.019824;0.019849;0.019845;0.019858;0.019855 | 1090176;1090176;1090176;1090176;1090176 | 13696;14464;13696;11520;11776 | 229024;246048;233728;226976;222720 |
203 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1216.00 | 917.33 | 45.60 | 26.01 | 13.87 | false | 0.460270;0.455644;0.457170;0.454648;0.454249 | 55488;55488;55488;55488;55488 | 1216;1216;1216;1216;1216 | 800;1024;928;1152;768 |
204 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 192 17 17]] | 21 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 864.00 | 896.00 | 45.00 | 0.00 | 0.00 | true | 0.449319;0.449935;0.449945;0.450862;0.448812 | 0;0;0;0;0 | 864;864;864;864;864 | 768;1024;1024;896;768 |
205 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 189 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 134922.67 | 1115168.00 | 8.70 | 195.93 | 8164.35 | false | 0.086836;0.086567;0.086680;0.086530;0.086498 | 244930560;244930560;244930560;244930560;244930560 | 136032;138080;133088;126304;135648 | 1128064;1117440;1096448;1104608;1123456 |
205 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 189 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.67 | 1161984 | 223776.00 | 641877.33 | 2.00 | 1.34 | 79.22 | true | 0.019971;0.019932;0.019984;0.019951;0.019935 | 1161984;1161984;1161984;1161984;1161984 | 223776;224160;223904;223648;223392 | 642720;634816;650720;645440;637472 |
205 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 189 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 1450.67 | 4249333.33 | 16.10 | 3.09 | 937.40 | true | 0.161052;0.159461;0.158330;0.163864;0.164982 | 13123584;13123584;13123584;13123584;13123584 | 1792;1152;1792;1024;1408 | 4254560;4250400;4242368;4251872;4245728 |
205 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 189 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032704.00 | 834378.67 | 44.50 | 0.00 | 0.00 | true | 0.453247;0.445479;0.449940;0.438004;0.440254 | 0;0;0;0;0 | 831424;838912;831968;832256;843040 | 1032704;1032704;1032704;1032704;1032704 |
205 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 189 | 221952 | 7160064 | 96392192 | GPU_0_bfc | 6938112 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1090176 | 17941.33 | 232021.33 | 2.00 | 4.36 | 181.70 | true | 0.019852;0.019856;0.019851;0.019851;0.019852 | 1090176;1090176;1090176;1090176;1090176 | 17728;15136;17216;19264;18880 | 226848;223520;234880;240064;234336 |
206 | InceptionV3/InceptionV3/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 27.667 | 221952 | 0 | 96170240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 1088.00 | 757.33 | 45.50 | 30.07 | 11.89 | false | 0.456888;0.455539;0.454797;0.454270;0.455368 | 55488;55488;55488;55488;55488 | 1088;1088;1088;1088;1088 | 800;672;928;672;800 |
208 | InceptionV3/InceptionV3/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 768 17 17]] | 24.667 | 1036288 | 0 | 96318720 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 512.00 | 242005.33 | 65.70 | 0.00 | 0.00 | true | 0.656538;0.646353;0.657374;0.655700;0.657570 | 0;0;0;0;0 | 512;2560;512;512;512 | 242560;236800;244736;240128;243328 |
209 | InceptionV3/InceptionV3/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 768 8 8]] | 39.667 | 196608 | 196608 | 96515328 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 49152 | 7936.00 | 198538.67 | 21.90 | 0.24 | 7.02 | true | 0.218525;0.219810;0.218023;0.219405;0.219389 | 49152;49152;49152;49152;49152 | 7680;7936;7936;7936;7936 | 198496;198304;198912;198816;197952 |
210 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 131.667 | 221952 | 811776 | 96737280 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44.33 | 94427328 | 10240.00 | 361376.00 | 3.10 | 254.10 | 2129.96 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 94427328;94427328;94427328;94427328;94427328 | 355520;378656;370528;350784;358080 | 9216;10240;10240;10240;10240 |
210 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 131.667 | 221952 | 811776 | 96737280 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 590506.67 | 404085.33 | 45.20 | 0.00 | 0.00 | true | 0.454669;0.450900;0.454050;0.452061;0.450210 | 0;0;0;0;0 | 590592;590336;590592;590592;590336 | 411712;389696;394816;408768;408672 |
211 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 125 | 221952 | 811776 | 96959232 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 94427328 | 256.00 | 101557.33 | 3.10 | 927.46 | 2360.68 | false | 0.031244;0.031245;0.031245;0.031245;0.031244 | 94427328;94427328;94427328;94427328;94427328 | 100320;100608;103872;103744;97696 | 256;256;256;256;256 |
211 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 125 | 221952 | 811776 | 96959232 | GPU_0_bfc | 589824 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 590080.00 | 194314.67 | 45.10 | 0.00 | 0.00 | true | 0.452518;0.452599;0.451460;0.450405;0.449171 | 0;0;0;0;0 | 590080;590080;590080;590080;590080 | 194976;193056;194432;193536;200064 |
212 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 27 | 221952 | 0 | 95922944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1280.00 | 384.00 | 45.50 | 33.35 | 13.87 | false | 0.455631;0.455933;0.455290;0.453503;0.454659 | 55488;55488;55488;55488;55488 | 1280;1280;1280;1280;1280 | 384;640;384;256;384 |
213 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22 | 221952 | 0 | 95922944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 0.00 | 44.20 | 72.25 | 13.87 | false | 0.443234;0.442183;0.441878;0.442171;0.442834 | 55488;55488;55488;55488;55488 | 768;768;768;768;768 | 0;0;0;0;0 |
214 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 95922944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.00 | 0.00 | 0.00 | true | 0.439619;0.439031;0.440003;0.439610;0.439994 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;512 |
215 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 95922944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 1621.33 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.436830;0.436174;0.436186;0.436384;0.436220 | 0;0;0;0;0 | 0;4864;0;0;5120 | 0;0;0;0;0 |
216 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 188.667 | 221952 | 7155968 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 244930560 | 201034.67 | 1124416.00 | 8.60 | 184.79 | 7900.99 | false | 0.086167;0.086243;0.086287;0.086298;0.086353 | 244930560;244930560;244930560;244930560;244930560 | 200992;206880;182560;206112;196000 | 1132000;1121792;1119456;1154816;1104608 |
216 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 188.667 | 221952 | 7155968 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 224842.67 | 653013.33 | 2.00 | 1.32 | 77.47 | true | 0.019935;0.019957;0.019957;0.019966;0.019959 | 1161984;1161984;1161984;1161984;1161984 | 224672;224928;224672;224928;224928 | 652416;650048;656576;665088;632832 |
216 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 188.667 | 221952 | 7155968 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2218.67 | 3908266.67 | 16.60 | 3.36 | 937.40 | true | 0.165148;0.163295;0.162970;0.170980;0.171078 | 13123584;13123584;13123584;13123584;13123584 | 3900896;3909952;3913952;3860896;3951104 | 1920;2432;2176;2048;2560 |
216 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 188.667 | 221952 | 7155968 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 582954.67 | 44.20 | 0.00 | 0.00 | true | 0.443774;0.442584;0.447583;0.438872;0.436884 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 582752;589408;579360;586752;578272 |
216 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 188.667 | 221952 | 7155968 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 13760.00 | 347434.67 | 2.00 | 3.02 | 218.04 | true | 0.019867;0.019857;0.019867;0.019872;0.019868 | 1090176;1090176;1090176;1090176;1090176 | 12960;14208;13888;13184;14976 | 345248;347456;348960;347392;347456 |
217 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 212.333 | 81920 | 2293760 | 96004864 | GPU_0_bfc | 2211840 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 113.00 | 70799360 | 222976.00 | 88586.67 | 3.10 | 227.24 | 626.54 | false | 0.031250;0.031250;0.031250;0.031250;0.031250 | 70799360;70799360;70799360;70799360;70799360 | 222976;222976;222976;222976;222976 | 89632;88672;83936;87456;96480 |
217 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 212.333 | 81920 | 2293760 | 96004864 | GPU_0_bfc | 2211840 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2212544.00 | 1683637.33 | 45.80 | 0.00 | 0.00 | true | 0.462466;0.458419;0.464748;0.453724;0.453063 | 0;0;0;0;0 | 2212544;2212544;2212544;2212544;2212544 | 1676192;1693088;1707200;1678720;1679104 |
218 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 27.333 | 221952 | 0 | 95782912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1536.00 | 64085.33 | 45.90 | 0.85 | 13.87 | true | 0.460400;0.454929;0.456303;0.461627;0.459676 | 55488;55488;55488;55488;55488 | 1536;1536;1536;1536;1536 | 64128;64128;64128;63872;64000 |
219 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 8 8]] | 21 | 81920 | 0 | 95782912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 20480 | 1280.00 | 18112.00 | 45.60 | 1.06 | 5.12 | true | 0.455226;0.456630;0.456527;0.455924;0.455035 | 20480;20480;20480;20480;20480 | 1280;1280;1280;1280;1280 | 18176;18080;18080;17920;18304 |
220 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 192 17 17]] | 21.333 | 221952 | 0 | 95782912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 768.00 | 1152.00 | 43.90 | 0.00 | 0.00 | true | 0.439108;0.439322;0.439326;0.439247;0.439166 | 0;0;0;0;0 | 768;768;768;768;768 | 1280;1152;1152;1152;1152 |
221 | InceptionV3/InceptionV3/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 320 8 8]] | 19.667 | 81920 | 0 | 95782912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 256.00 | 44.10 | 0.00 | 0.00 | true | 0.441117;0.441081;0.440964;0.441716;0.441198 | 0;0;0;0;0 | 256;256;256;256;256 | 0;0;0;0;0 |
222 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185 | 361984 | 7296000 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 28917.33 | 1047936.00 | 8.70 | 227.45 | 8164.35 | false | 0.086757;0.086602;0.086611;0.086427;0.086525 | 244930560;244930560;244930560;244930560;244930560 | 30112;26912;28384;28256;30752 | 1038912;1059328;1040032;1098784;1044448 |
222 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185 | 361984 | 7296000 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.67 | 13123584 | 832.00 | 3449674.67 | 17.00 | 3.80 | 960.24 | true | 0.168351;0.169715;0.170472;0.169781;0.170256 | 13123584;13123584;13123584;13123584;13123584 | 896;768;832;576;896 | 3474496;3441248;3459872;3377536;3447904 |
222 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185 | 361984 | 7296000 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 1161984 | 226122.67 | 650645.33 | 2.00 | 1.33 | 96.83 | true | 0.019915;0.019915;0.019916;0.019914;0.019914 | 1161984;1161984;1161984;1161984;1161984 | 226208;226080;226208;226080;225952 | 637184;657888;639264;667136;654784 |
222 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185 | 361984 | 7296000 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.33 | 0 | 1032192.00 | 211424.00 | 44.20 | 0.00 | 0.00 | true | 0.445659;0.441932;0.444312;0.438940;0.438622 | 0;0;0;0;0 | 1032192;1032192;1037056;1032192;1032192 | 216096;210592;211168;211328;211776 |
222 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185 | 361984 | 7296000 | 96144896 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.67 | 1090176 | 12384.00 | 265781.33 | 2.00 | 3.92 | 163.52 | true | 0.019858;0.019852;0.019854;0.019853;0.019851 | 1090176;1090176;1090176;1090176;1090176 | 266976;263232;266048;264320;274240 | 12416;12448;12928;11968;12288 |
223 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 27.667 | 361984 | 0 | 95922944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 1216.00 | 640.00 | 45.60 | 29.90 | 13.87 | false | 0.456026;0.454883;0.455974;0.456468;0.457145 | 55488;55488;55488;55488;55488 | 1216;1216;1216;1216;1216 | 640;640;896;640;640 |
224 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu | Relu | [[1 192 17 17]] | 20 | 361984 | 0 | 95922944 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 608.00 | 405.33 | 44.90 | 0.00 | 0.00 | true | 0.449104;0.448759;0.449369;0.449113;0.448834 | 0;0;0;0;0 | 608;608;608;608;608 | 384;768;384;448;384 |
225 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 176 | 49152 | 1376256 | 95972096 | GPU_0_bfc | 1327104 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.00 | 42479616 | 1706.67 | 80202.67 | 3.10 | 518.62 | 493.95 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 42479616;42479616;42479616;42479616;42479616 | 1536;1792;1792;1536;1792 | 79584;92096;77312;78464;82560 |
225 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 176 | 49152 | 1376256 | 95972096 | GPU_0_bfc | 1327104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1328128.00 | 1192938.67 | 44.80 | 0.00 | 0.00 | true | 0.451201;0.450615;0.451565;0.442856;0.441739 | 0;0;0;0;0 | 1194880;1177632;1194656;1190304;1193856 | 1328128;1328128;1328128;1329152;1328128 |
226 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 8 8]] | 26 | 49152 | 0 | 95610112 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 12288 | 768.00 | 27104.00 | 45.80 | 0.44 | 3.07 | true | 0.454413;0.457117;0.458105;0.458557;0.457823 | 12288;12288;12288;12288;12288 | 1024;768;768;768;768 | 27296;26752;27104;27040;27168 |
227 | InceptionV3/InceptionV3/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 192 8 8]] | 23.333 | 49152 | 0 | 95610112 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.442511;0.441648;0.442480;0.442798;0.441763 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;64;0;0 |
229 | InceptionV3/InceptionV3/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1280 8 8]] | 42.667 | 327680 | 327680 | 95972096 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 1590875 | 7168.00 | 104256.00 | 33.60 | 14.28 | 198.86 | true | 0.333923;0.335829;0.336646;0.336343;0.335488 | 1590875;1590875;1590875;1590875;1590875 | 103744;104832;104448;103168;104576 | 6144;7424;7168;7168;7168 |
230 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 158.333 | 114688 | 2408448 | 96086784 | GPU_0_bfc | 2293760 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 73428992 | 0.00 | 179552.00 | 3.10 | 408.96 | 1147.33 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 73428992;73428992;73428992;73428992;73428992 | 0;0;0;0;0 | 186272;175488;181024;181152;176480 |
230 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 158.333 | 114688 | 2408448 | 96086784 | GPU_0_bfc | 2293760 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.33 | 0 | 2293760.00 | 1140938.67 | 45.60 | 0.00 | 0.00 | true | 0.460389;0.460173;0.456690;0.448123;0.451071 | 0;0;0;0;0 | 2293760;2293760;2293760;2295552;2293760 | 1134048;1144832;1146048;1128992;1143936 |
231 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 156.333 | 163840 | 2129920 | 96250624 | GPU_0_bfc | 1966080 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 62939136 | 0.00 | 842.67 | 3.10 | 74690.40 | 983.42 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 62939136;62939136;62939136;62939136;62939136 | 0;0;0;0;0 | 416;1312;672;800;1056 |
231 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 156.333 | 163840 | 2129920 | 96250624 | GPU_0_bfc | 1966080 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.67 | 0 | 1966080.00 | 214848.00 | 43.50 | 0.00 | 0.00 | true | 0.440621;0.437568;0.435960;0.432223;0.430050 | 0;0;0;0;0 | 1966080;1966080;1966080;1966080;1966592 | 214912;215040;213696;215488;214592 |
232 | InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 153.667 | 81920 | 1720320 | 96332544 | GPU_0_bfc | 1638400 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 65.00 | 52449280 | 0.00 | 0.00 | 3.10 | 0.00 | 806.91 | true | 0.031249;0.031249;0.031249;0.031249;0.031249 | 52449280;52449280;52449280;52449280;52449280 | 128;0;0;0;0 | 0;0;0;0;0 |
232 | InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 153.667 | 81920 | 1720320 | 96332544 | GPU_0_bfc | 1638400 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1638400.00 | 337173.33 | 45.10 | 0.00 | 0.00 | true | 0.455730;0.451819;0.451813;0.446339;0.448378 | 0;0;0;0;0 | 337152;336544;340096;336384;337824 | 1638400;1638400;1638400;1638400;1638400 |
233 | InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 168.667 | 49152 | 1032192 | 96019712 | GPU_0_bfc | 983040 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 83.00 | 31469568 | 328448.00 | 18176.00 | 3.10 | 90.79 | 379.15 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 31469568;31469568;31469568;31469568;31469568 | 329984;327680;327680;327680;330752 | 18304;18048;18688;18176;18048 |
233 | InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 168.667 | 49152 | 1032192 | 96019712 | GPU_0_bfc | 983040 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 983040.00 | 105130.67 | 43.60 | 0.00 | 0.00 | true | 0.440825;0.434751;0.438390;0.432261;0.433500 | 0;0;0;0;0 | 983040;983040;983040;983040;983040 | 105088;105856;104192;105856;104448 |
234 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 29.667 | 114688 | 0 | 95692032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 28672 | 115669.33 | 0.00 | 46.40 | 0.25 | 6.14 | true | 0.464385;0.459882;0.463083;0.464523;0.465384 | 28672;28672;28672;28672;28672 | 115584;115712;115328;115968;115712 | 0;0;0;0;0 |
235 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 21.333 | 163840 | 0 | 95692032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 170.67 | 45.40 | 14.40 | 6.14 | true | 0.453830;0.453277;0.455847;0.454863;0.454349 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 512;0;9728;0;0 |
236 | InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 320 8 8]] | 21 | 81920 | 0 | 95692032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 20480 | 1280.00 | 0.00 | 45.50 | 16.00 | 5.12 | true | 0.455208;0.452331;0.455071;0.454472;0.455077 | 20480;20480;20480;20480;20480 | 1280;1280;1280;1280;1280 | 0;0;0;0;0 |
237 | InceptionV3/InceptionV3/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 8 8]] | 20.667 | 49152 | 0 | 95692032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 12288 | 768.00 | 0.00 | 45.30 | 16.00 | 3.07 | true | 0.450464;0.446550;0.456707;0.455038;0.454783 | 12288;12288;12288;12288;12288 | 768;768;768;5888;768 | 0;0;0;0;0 |
238 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 448 8 8]] | 20 | 114688 | 0 | 95692032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 44.10 | 0.00 | 0.00 | true | 0.441346;0.441430;0.441649;0.441324;0.441574 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;512;0;0 |
239 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 21.333 | 163840 | 0 | 95692032 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 85.33 | 44.00 | 0.00 | 0.00 | true | 0.440270;0.440292;0.439886;0.440052;0.440632 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;4864;0;256 |
240 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 252 | 98304 | 23496192 | 95790336 | GPU_0_bfc | 23397888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 101.00 | 183558144 | 11072672.00 | 3564768.00 | 12.50 | 12.54 | 1817.41 | true | 0.124978;0.124978;0.124978;0.124977;0.124977 | 183558144;183558144;183558144;183558144;183558144 | 3538400;3507296;3559904;3597792;3596000 | 11073952;11025440;11016224;11118624;11125152 |
240 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 252 | 98304 | 23496192 | 95790336 | GPU_0_bfc | 23397888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 33.00 | 0 | 6415637.33 | 4543669.33 | 46.20 | 0.00 | 0.00 | true | 0.466040;0.464669;0.464999;0.455290;0.454887 | 0;0;0;0;0 | 6433728;6492928;6409024;6404160;6393216 | 4545472;4573088;4547712;4533856;4537824 |
240 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 252 | 98304 | 23496192 | 95790336 | GPU_0_bfc | 23397888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 23.67 | 9977856 | 4847146.67 | 9914741.33 | 51.70 | 0.68 | 421.59 | true | 0.519113;0.516475;0.518203;0.515745;0.512834 | 9977856;9977856;9977856;9977856;9977856 | 4798016;4888320;4863488;4845248;4832704 | 9932064;9956352;9910688;9888160;9901472 |
241 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 168 | 114688 | 1884160 | 95790336 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.33 | 56647680 | 99008.00 | 426.67 | 3.10 | 569.70 | 762.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 99008;99008;99008;99008;99008 | 768;1664;256;256;0 |
241 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 168 | 114688 | 1884160 | 95790336 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.33 | 0 | 1769920.00 | 8320.00 | 43.90 | 0.00 | 0.00 | true | 0.441080;0.440186;0.440707;0.434044;0.434878 | 0;0;0;0;0 | 1769920;1769920;1769920;1769920;1769920 | 26496;10112;1408;8832;6016 |
242 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 144.667 | 98304 | 1867776 | 95888640 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 56647680 | 0.00 | 63104.00 | 3.10 | 897.69 | 965.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 0;2048;0;0;0 | 63104;63104;63104;63104;63104 |
242 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 144.667 | 98304 | 1867776 | 95888640 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.33 | 0 | 1769472.00 | 2432.00 | 43.70 | 0.00 | 0.00 | true | 0.439307;0.439234;0.438188;0.431939;0.433194 | 0;0;0;0;0 | 1769472;1769472;1769472;1769472;1769472 | 2176;2432;2432;2432;2432 |
243 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 29.667 | 98304 | 0 | 95724800 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 4608.00 | 0.00 | 46.30 | 5.33 | 6.14 | true | 0.463844;0.461958;0.463259;0.464107;0.461429 | 24576;24576;24576;24576;24576 | 128;0;0;0;0 | 3840;3712;6272;9088;3712 |
244 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 21 | 114688 | 0 | 95724800 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 0.00 | 45.40 | 16.00 | 6.14 | true | 0.452868;0.454345;0.452426;0.453912;0.453743 | 24576;24576;24576;24576;24576 | 128;0;0;0;0 | 1536;1536;1536;1536;1536 |
245 | InceptionV3/InceptionV3/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 20.667 | 98304 | 0 | 95724800 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 32.00 | 45.40 | 15.67 | 6.14 | true | 0.452113;0.454040;0.455348;0.454803;0.453668 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 32;32;32;32;32 |
246 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 384 8 8]] | 20.667 | 98304 | 0 | 95724800 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 512.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.442337;0.442022;0.442320;0.441810;0.442094 | 0;0;0;0;0 | 0;0;0;0;0 | 512;512;512;512;512 |
247 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 145 | 163840 | 1933312 | 95888640 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.33 | 56647680 | 0.00 | 42.67 | 3.10 | 1327669.63 | 971.11 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 0;0;0;0;0 | 128;768;0;0;0 |
247 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 145 | 163840 | 1933312 | 95888640 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.33 | 0 | 1769536.00 | 576.00 | 43.60 | 0.00 | 0.00 | true | 0.436843;0.438874;0.439344;0.432417;0.432656 | 0;0;0;0;0 | 1769536;1769536;1769536;1769536;1769536 | 576;608;576;576;576 |
248 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 146 | 98304 | 1867776 | 95986944 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 56647680 | 85.33 | 26794.67 | 3.10 | 2107.43 | 965.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 2048;0;0;0;256 | 27136;26624;26624;26624;27264 |
248 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 146 | 98304 | 1867776 | 95986944 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.33 | 0 | 1769472.00 | 31744.00 | 43.60 | 0.00 | 0.00 | true | 0.436669;0.439578;0.439963;0.432762;0.430329 | 0;0;0;0;0 | 1769472;1769472;1769472;1769472;1769472 | 31616;31744;31744;31744;31744 |
249 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 26.667 | 163840 | 0 | 95888640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 768.00 | 45.70 | 10.67 | 6.14 | true | 0.457607;0.457394;0.457508;0.456368;0.457091 | 24576;24576;24576;24576;24576 | 768;768;768;768;896 | 1536;6400;1536;1536;1536 |
250 | InceptionV3/InceptionV3/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 21 | 98304 | 0 | 95888640 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 42.67 | 45.40 | 15.57 | 6.14 | true | 0.452051;0.453815;0.454424;0.454337;0.452945 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 128;0;0;0;128 |
252 | InceptionV3/InceptionV3/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 2048 8 8]] | 25 | 524288 | 0 | 95806720 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 63.30 | 0.00 | 0.00 | true | 0.632345;0.632581;0.632806;0.633501;0.630843 | 0;0;0;0;0 | 128;0;0;0;0 | 0;0;0;0;0 |
253 | InceptionV3/InceptionV3/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 2048 8 8]] | 44 | 524288 | 524288 | 96331008 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.00 | 1974668 | 256.00 | 202.67 | 44.40 | 4305.23 | 219.41 | false | 0.443307;0.447290;0.443343;0.444101;0.444079 | 1974668;1974668;1974668;1974668;1974668 | 352;128;128;256;224 | 256;256;256;256;256 |
254 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 242 | 114688 | 3784704 | 96445696 | GPU_0_bfc | 3670016 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 139.00 | 117469184 | 1481258.67 | 537493.33 | 3.10 | 58.19 | 845.10 | false | 0.031250;0.031250;0.031250;0.031250;0.031250 | 117469184;117469184;117469184;117469184;117469184 | 1479936;1479040;1493504;1484800;1475072 | 536896;542016;545472;533568;532416 |
254 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 242 | 114688 | 3784704 | 96445696 | GPU_0_bfc | 3670016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 21.67 | 0 | 3670464.00 | 2074144.00 | 42.90 | 0.00 | 0.00 | true | 0.432365;0.431259;0.430278;0.426515;0.423206 | 0;0;0;0;0 | 2062112;2073376;2070976;2080992;2078080 | 3672064;3670080;3670272;3670016;3671040 |
255 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 236 | 98304 | 3244032 | 96544000 | GPU_0_bfc | 3145728 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 138.00 | 100687872 | 780053.33 | 486528.00 | 3.10 | 79.50 | 729.62 | false | 0.031250;0.031250;0.031250;0.031250;0.031249 | 100687872;100687872;100687872;100687872;100687872 | 772672;784320;782144;776640;781376 | 484192;482688;486080;489312;490080 |
255 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 236 | 98304 | 3244032 | 96544000 | GPU_0_bfc | 3145728 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 19.00 | 0 | 3146048.00 | 2233258.67 | 44.90 | 0.00 | 0.00 | true | 0.455707;0.452604;0.451283;0.441247;0.443466 | 0;0;0;0;0 | 3146048;3146048;3146048;3146048;3146048 | 2231648;2239296;2236064;2232064;2231424 |
256 | InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 202.333 | 81920 | 2703360 | 96625920 | GPU_0_bfc | 2621440 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 122.67 | 83906560 | 223061.33 | 416501.33 | 3.10 | 131.19 | 684.02 | false | 0.031250;0.031250;0.031250;0.031249;0.031250 | 83906560;83906560;83906560;83906560;83906560 | 222976;222592;224000;223616;221696 | 419488;416640;420768;411264;413376 |
256 | InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 202.333 | 81920 | 2703360 | 96625920 | GPU_0_bfc | 2621440 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.00 | 0 | 2621504.00 | 1025642.67 | 44.60 | 0.00 | 0.00 | true | 0.448284;0.449095;0.448973;0.440733;0.438472 | 0;0;0;0;0 | 1019552;1020448;1027296;1029184;1030592 | 2621504;2621504;2621504;2621504;2621504 |
257 | InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 222 | 49152 | 1622016 | 96150784 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 131.00 | 50343936 | 524288.00 | 46944.00 | 3.10 | 88.13 | 384.30 | false | 0.031250;0.031249;0.031250;0.031250;0.031250 | 50343936;50343936;50343936;50343936;50343936 | 47456;46432;47456;46944;46016 | 524288;524288;529152;524288;524288 |
257 | InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 222 | 49152 | 1622016 | 96150784 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 716213.33 | 44.50 | 0.00 | 0.00 | true | 0.447298;0.450078;0.444250;0.441699;0.443878 | 0;0;0;0;0 | 1572864;1573888;1572864;1572864;1572864 | 708320;719488;719168;715584;713888 |
258 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 27.333 | 114688 | 0 | 95626496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 28672 | 116736.00 | 85.33 | 46.30 | 0.25 | 6.14 | true | 0.464330;0.460501;0.461298;0.464623;0.464841 | 28672;28672;28672;28672;28672 | 116736;116736;116736;116736;116736 | 0;0;128;128;128 |
259 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 20.667 | 98304 | 0 | 95626496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 15872.00 | 74890.67 | 45.30 | 0.27 | 6.14 | true | 0.451613;0.452081;0.453341;0.456712;0.454998 | 24576;24576;24576;24576;24576 | 16384;14720;15616;16896;15616 | 74976;74400;74912;76832;74784 |
260 | InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 320 8 8]] | 22.333 | 81920 | 0 | 95626496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 20480 | 1280.00 | 56746.67 | 45.60 | 0.35 | 5.12 | true | 0.452128;0.457277;0.455989;0.454959;0.455619 | 20480;20480;20480;20480;20480 | 1280;1280;1280;1280;1280 | 56960;56960;56576;56320;56704 |
261 | InceptionV3/InceptionV3/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 8 8]] | 19.667 | 49152 | 0 | 95626496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 12288 | 768.00 | 45994.67 | 45.60 | 0.26 | 2.84 | true | 0.455647;0.455058;0.455997;0.457034;0.457097 | 12288;12288;12288;12288;12288 | 768;768;768;768;768 | 48128;48256;32128;41600;48256 |
262 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 448 8 8]] | 20.667 | 114688 | 0 | 95626496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 58112.00 | 44.10 | 0.00 | 0.00 | true | 0.440856;0.440310;0.441381;0.441363;0.440444 | 0;0;0;0;0 | 0;0;0;0;0 | 58112;58112;57920;58112;58112 |
263 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 18.667 | 98304 | 0 | 95626496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 22432.00 | 44.20 | 0.00 | 0.00 | true | 0.441913;0.442111;0.441433;0.442205;0.442081 | 0;0;0;0;0 | 0;0;0;0;0 | 22528;22272;22496;22464;22336 |
264 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 242.667 | 98304 | 23496192 | 95724800 | GPU_0_bfc | 23397888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 94.67 | 183558144 | 11052405.33 | 3453493.33 | 12.50 | 12.65 | 1938.99 | true | 0.124971;0.124971;0.124970;0.124971;0.124971 | 183558144;183558144;183558144;183558144;183558144 | 11048352;11015328;11058592;11080096;11050272 | 3472352;3413472;3460448;3440608;3459424 |
264 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 242.667 | 98304 | 23496192 | 95724800 | GPU_0_bfc | 23397888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 33.00 | 0 | 6377109.33 | 4802560.00 | 46.30 | 0.00 | 0.00 | true | 0.464986;0.466740;0.467747;0.457612;0.457626 | 0;0;0;0;0 | 6418624;6373504;6394880;6359360;6362944 | 4831840;4799200;4802656;4802432;4802592 |
264 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 242.667 | 98304 | 23496192 | 95724800 | GPU_0_bfc | 23397888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 23.00 | 9977856 | 4991061.33 | 10074592.00 | 52.10 | 0.66 | 433.82 | true | 0.522630;0.522446;0.526094;0.519046;0.517435 | 9977856;9977856;9977856;9977856;9977856 | 10014720;10101920;10067456;10093984;10062336 | 5005440;4967040;5002432;4963584;5003712 |
265 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 167.333 | 98304 | 1867776 | 95708416 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.00 | 56647680 | 98581.33 | 810.67 | 3.10 | 569.94 | 765.51 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 98496;98496;98752;98752;98496 | 896;1664;512;768;768 |
265 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 167.333 | 98304 | 1867776 | 95708416 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1769920.00 | 17877.33 | 43.70 | 0.00 | 0.00 | true | 0.438853;0.438337;0.438754;0.431044;0.433635 | 0;0;0;0;0 | 1769920;1769920;1769920;1769920;1769920 | 20224;27648;17664;15360;15744 |
266 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 142.667 | 98304 | 1867776 | 95806720 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 56647680 | 0.00 | 0.00 | 3.10 | 0.00 | 965.58 | true | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 0;0;0;0;0 | 0;0;0;512;0 |
266 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 142.667 | 98304 | 1867776 | 95806720 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.33 | 0 | 1769472.00 | 66688.00 | 43.70 | 0.00 | 0.00 | true | 0.440047;0.438269;0.438610;0.433656;0.434588 | 0;0;0;0;0 | 1769472;1769472;1769472;1769472;1769472 | 66816;66304;66944;65664;66944 |
267 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 28.333 | 98304 | 0 | 95708416 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 22442.67 | 0.00 | 46.40 | 1.10 | 6.14 | true | 0.464053;0.464320;0.461670;0.465296;0.464006 | 24576;24576;24576;24576;24576 | 22528;22656;21888;23040;22144 | 0;0;0;0;512 |
268 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 20.667 | 98304 | 0 | 95708416 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 0.00 | 45.40 | 16.00 | 6.14 | true | 0.452922;0.455072;0.447813;0.454283;0.453739 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 0;0;0;0;0 |
269 | InceptionV3/InceptionV3/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 21.667 | 98304 | 0 | 95708416 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 24576 | 1536.00 | 0.00 | 45.40 | 16.00 | 6.70 | true | 0.452816;0.454537;0.453757;0.454654;0.453376 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 0;0;0;0;0 |
270 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 384 8 8]] | 19.667 | 98304 | 0 | 95708416 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.441629;0.442303;0.442104;0.441962;0.441629 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
271 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 141.667 | 98304 | 1867776 | 95806720 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 56647680 | 0.00 | 0.00 | 3.10 | 0.00 | 976.68 | true | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 0;0;0;0;2048 | 0;0;32;0;0 |
271 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 141.667 | 98304 | 1867776 | 95806720 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.67 | 0 | 1769536.00 | 736.00 | 43.70 | 0.00 | 0.00 | true | 0.438090;0.442463;0.438217;0.434129;0.429616 | 0;0;0;0;0 | 1769536;1769536;1769536;1769536;1769536 | 736;736;736;768;736 |
272 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 142 | 98304 | 1867776 | 95905024 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 56647680 | 0.00 | 42.67 | 3.10 | 1327669.63 | 965.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 56647680;56647680;56647680;56647680;56647680 | 0;0;0;0;0 | 128;0;4096;0;0 |
272 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 142 | 98304 | 1867776 | 95905024 | GPU_0_bfc | 1769472 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1769472.00 | 25909.33 | 43.80 | 0.00 | 0.00 | true | 0.440800;0.440179;0.440393;0.434038;0.434774 | 0;0;0;0;0 | 25952;25952;25888;25888;25888 | 1769472;1769472;1769472;1769472;1769472 |
273 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 26.333 | 98304 | 0 | 95806720 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 128.00 | 45.70 | 14.77 | 6.14 | true | 0.457857;0.456555;0.456918;0.457317;0.457026 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;3328 | 256;128;128;128;128 |
274 | InceptionV3/InceptionV3/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 24 | 98304 | 0 | 95806720 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 32.00 | 45.20 | 15.67 | 6.14 | true | 0.451123;0.452507;0.450959;0.454264;0.453146 | 24576;24576;24576;24576;24576 | 1536;1536;2560;1536;1536 | 32;32;32;32;32 |
276 | InceptionV3/InceptionV3/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 2048 8 8]] | 24 | 524288 | 0 | 95806720 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 63.40 | 0.00 | 0.00 | true | 0.632672;0.633086;0.634215;0.633942;0.634213 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
277 | InceptionV3/Logits/AvgPool_1a_8x8/AvgPool | AvgPool | [[1 2048 1 1]] | 41.667 | 8192 | 8192 | 95814912 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 178041 | 2816.00 | 0.00 | 12.30 | 63.22 | 25.43 | false | 0.122533;0.122584;0.122568;0.122498;0.122520 | 178041;178041;178041;178041;178041 | 0;0;0;0;0 | 2816;2816;2816;2816;2816 |
278 | InceptionV3/Logits/Conv2d_1c_1x1/convolution | Conv2D | [[1 1001 1 1]] | 246 | 4096 | 8204288 | 95294720 | GPU_0_bfc | 8200192 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 120.67 | 134218729 | 6629397.33 | 2212565.33 | 3.10 | 15.18 | 1112.31 | true | 0.031249;0.031249;0.031249;0.031249;0.031249 | 134218729;134218729;134218729;134218729;134218729 | 6635456;6625728;6636864;6627008;6620224 | 2211776;2208800;2217120;2207488;2218112 |
278 | InceptionV3/Logits/Conv2d_1c_1x1/convolution | Conv2D | [[1 1001 1 1]] | 246 | 4096 | 8204288 | 95294720 | GPU_0_bfc | 8200192 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 44.67 | 0 | 8323669.33 | 5842986.67 | 44.00 | 0.00 | 0.00 | true | 0.446372;0.442276;0.444944;0.433600;0.433268 | 0;0;0;0;0 | 8323840;8323776;8323520;8323712;8323392 | 5841920;5843680;5843360;5847584;5829088 |
279 | InceptionV3/Logits/Conv2d_1c_1x1/BiasAdd | BiasAdd | [[1 1001 1 1]] | 30.667 | 4096 | 0 | 95286528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 4.00 | 1001 | 6304.00 | 128.00 | 47.60 | 0.16 | 0.25 | true | 0.476590;0.475535;0.475832;0.479330;0.472516 | 1001;1001;1001;1001;1001 | 6304;6304;6304;7328;6304 | 128;128;128;128;128 |
283 | InceptionV3/Predictions/Softmax | Softmax | [[1 1001]] | 63.333 | 4096 | 8192 | 95286528 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 8.00 | 10431 | 6720.00 | 0.00 | 2.40 | 1.55 | 1.30 | true | 0.024193;0.023901;0.024001;0.023838;0.023883 | 10431;10431;10431;10431;10431 | 6720;6720;6720;6720;6720 | 0;0;0;0;0 |
283 | InceptionV3/Predictions/Softmax | Softmax | [[1 1001]] | 63.333 | 4096 | 8192 | 95286528 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 4.67 | 0 | 4288.00 | 85.33 | 4.00 | 0.00 | 0.00 | true | 0.039027;0.041318;0.040174;0.040073;0.040128 | 0;0;0;0;0 | 4288;4288;4288;4288;4288 | 128;128;0;128;0 |
283 | InceptionV3/Predictions/Softmax | Softmax | [[1 1001]] | 63.333 | 4096 | 8192 | 95286528 | GPU_0_bfc | 8192 | 0 | 0 | 0 | void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 3.67 | 24024 | 2304.00 | 640.00 | 6.20 | 8.16 | 6.55 | true | 0.062256;0.062258;0.062259;0.062259;0.062272 | 24024;24024;24024;24024;24024 | 640;768;640;384;640 | 2304;2304;2304;2560;2304 |
Showing 1 to 456 of 456 entries