ELF> @(@8@ĻĻ HH H  xx x  $$Ptd \\QtdRtdHH H GNU ݬMWP,cyxz (r@  5 8 Vp%b  Kt "3]L?dJ3 |__gmon_start____cxa_finalize_Jv_RegisterClasses__cudaRegisterFatBinary__cudaRegisterFunction__cudaRegisterVarcudaSetupArgumentcudaLaunch__cudaUnregisterFatBinaryceilfcudaMalloccudaMemcpycudaMemcpyToSymbolcudaThreadSynchronizecudaConfigureCallcudaFreecudaGetErrorStringstderr__fprintf_chkmexFunctionmexErrMsgTxtmxGetDatamxGetNmxGetMmxGetScalarmxCreateNumericMatrix_700mxIsSingle__gxx_personality_v0libcudart.so.4libcublas.so.4libmx.solibmex.solibmat.solibm.so.6libstdc++.so.6libpthread.so.0libc.so.6__cxa_atexitdense_distance_gpu_large32x16.mexa64MEXCXXABI_1.3GLIBC_2.2.5GLIBC_2.3.4v%QJ ӯkN ui Yui Yti eP    ȿ ؿ       п       (  0  8  @  H  P X ` h p x          HJH5 % @% h% h% h% h%z h%r h%j h%b hp%Z h`%R h P%J h @%B h 0%: h %2 h %* h%" h% h% h% h% h% h% h% h% hp%ڲ h`HH HtHÐU=ز HATSubH= t H= H+ L% H L)HHH9s HH AH H9rn [A\UH=׮ HtH HtH= ÐH= H(H=YH" - H H5 H= E1AHD$HHD$HD$H$H ݧH5 H= E1AHD$HHD$HD$H$KH ħH5- H=v E1AHD$HHD$HD$H$H H5 H=- E1AHD$HHD$HD$H$H H5 H= E1AHD$HHD$HD$H$pH iH= H5 AE1D$H$H 8H=m H5r AE1D$H$H H=8 H5A AE1D$H$H ۨH= H5 AE1D$H$|H(H=ɯ AW*AIAVEAUEATAIULSHYDT$(DD$4L$8L$ t$7H;HD$*I>HD$I>AH;H;IE9t H=ߠI~!D,AI~ D,Av H=1ɺIHHT$H|$EىIDt$D,$%H\$(Hl$0Ld$8Ll$@Lt$HL|$PHXfH=TmH9sI~aH(H|$H|$Ht$HT$1" .file 2 "/tmp/tmpxft_00006296_00000000-6_dense_distance_gpu_large32x16.cudafe2.gpu" .file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.5/include/stddef.h" .file 4 "/usr/local/cuda/include/crt/device_runtime.h" .file 5 "/usr/local/cuda/include/host_defines.h" .file 6 "/usr/local/cuda/include/builtin_types.h" .file 7 "/usr/local/cuda/include/device_types.h" .file 8 "/usr/local/cuda/include/driver_types.h" .file 9 "/usr/local/cuda/include/surface_types.h" .file 10 "/usr/local/cuda/include/texture_types.h" .file 11 "/usr/local/cuda/include/vector_types.h" .file 12 "/usr/local/cuda/include/device_launch_parameters.h" .file 13 "/usr/local/cuda/include/crt/storage_class.h" .file 14 "/usr/include/bits/types.h" .file 15 "/usr/include/time.h" .file 16 "dense_distance_gpu_large32x16.cu" .file 17 "/usr/local/cuda/include/common_functions.h" .file 18 "/usr/local/cuda/include/math_functions.h" .file 19 "/usr/local/cuda/include/math_constants.h" .file 20 "/usr/local/cuda/include/device_functions.h" .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" .file 26 "/usr/local/cuda/include/surface_functions.h" .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" .const .s32 n1C; .const .s32 n2C; .const .s32 blockItC; .const .s32 descDimC; .entry _Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0_ ( .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__F1D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__F2D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__DISTSD) { .reg .u32 %r<48>; .reg .u64 %rd<32>; .reg .f32 %f<16>; .reg .pred %p<13>; .shared .align 4 .b8 __cuda___cuda_local_var_45591_41_non_const_F2Ds24[2176]; .shared .align 4 .b8 __cuda___cuda_local_var_45590_41_non_const_F1Ds2200[2176]; .loc 16 16 0 $LDWbegin__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0_: mov.u32 %r1, %ctaid.y; mul.lo.u32 %r2, %r1, 32; ld.const.s32 %r3, [n2C]; mov.u32 %r4, %tid.x; ld.const.s32 %r5, [descDimC]; mov.u32 %r6, 0; setp.le.s32 %p1, %r5, %r6; @%p1 bra $Lt_0_13314; mov.u64 %rd1, __cuda___cuda_local_var_45591_41_non_const_F2Ds24; mov.u64 %rd2, __cuda___cuda_local_var_45590_41_non_const_F1Ds2200; mov.u32 %r7, %ctaid.x; mul.lo.u32 %r8, %r7, 32; ld.const.s32 %r9, [blockItC]; mul.lo.s32 %r10, %r9, 1024; add.s32 %r11, %r10, %r8; add.s32 %r12, %r5, 15; shr.s32 %r13, %r12, 31; mov.s32 %r14, 15; and.b32 %r15, %r13, %r14; add.s32 %r16, %r15, %r12; shr.s32 %r17, %r16, 4; mov.u32 %r18, %tid.y; mov.s32 %r19, %r18; add.u32 %r20, %r5, %r18; cvt.u64.u32 %rd3, %r18; add.u32 %r21, %r2, %r4; cvt.u64.u32 %rd4, %r4; mul.wide.u32 %rd5, %r4, 17; setp.gt.u32 %p2, %r3, %r21; mov.pred %p3, %p2; mov.pred %p4, %p5; add.u32 %r22, %r11, %r4; selp.s32 %r23, 1, 0, %p3; ld.const.s32 %r24, [n1C]; setp.gt.u32 %p6, %r24, %r22; add.u64 %rd6, %rd3, %rd5; mul.lo.u64 %rd7, %rd6, 4; selp.s32 %r25, 1, 0, %p6; mul.wide.u32 %rd8, %r18, 68; add.u64 %rd9, %rd2, %rd8; add.u64 %rd10, %rd7, %rd2; add.u64 %rd11, %rd7, %rd1; mul.lo.u64 %rd12, %rd5, 4; add.u64 %rd13, %rd1, %rd12; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 mov.s32 %r26, %r17; $Lt_0_10242: // Loop body line 16, nesting depth: 1, estimated iterations: unknown setp.gt.u32 %p7, %r5, %r19; selp.s32 %r27, 1, 0, %p7; and.b32 %r28, %r27, %r25; mov.u32 %r29, 0; setp.eq.s32 %p8, %r28, %r29; @%p8 bra $Lt_0_10754; .loc 16 26 0 ld.param.u64 %rd14, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__F1D]; mul.lo.u32 %r30, %r24, %r19; add.u32 %r31, %r22, %r30; cvt.u64.u32 %rd15, %r31; mul.wide.u32 %rd16, %r31, 4; add.u64 %rd17, %rd14, %rd16; ld.global.f32 %f3, [%rd17+0]; st.volatile.shared.f32 [%rd10+0], %f3; bra.uni $Lt_0_10498; $Lt_0_10754: .loc 16 28 0 mov.f32 %f4, 0f00000000; // 0 st.volatile.shared.f32 [%rd10+0], %f4; $Lt_0_10498: .loc 16 30 0 bar.sync 0; and.b32 %r32, %r27, %r23; mov.u32 %r33, 0; setp.eq.s32 %p9, %r32, %r33; @%p9 bra $Lt_0_11266; .loc 16 33 0 ld.param.u64 %rd18, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__F2D]; mul.lo.u32 %r34, %r3, %r19; add.u32 %r35, %r21, %r34; cvt.u64.u32 %rd19, %r35; mul.wide.u32 %rd20, %r35, 4; add.u64 %rd21, %rd18, %rd20; ld.global.f32 %f5, [%rd21+0]; st.volatile.shared.f32 [%rd11+0], %f5; bra.uni $Lt_0_11010; $Lt_0_11266: .loc 16 35 0 mov.f32 %f6, 0f00000000; // 0 st.volatile.shared.f32 [%rd11+0], %f6; $Lt_0_11010: .loc 16 37 0 bar.sync 0; .loc 16 24 0 mov.s64 %rd22, %rd9; mov.s64 %rd23, %rd13; mov.s32 %r36, 0; $Lt_0_12034: .pragma "nounroll"; // Loop body line 24, nesting depth: 2, iterations: 16 .loc 16 49 0 ld.volatile.shared.f32 %f7, [%rd22+0]; ld.volatile.shared.f32 %f8, [%rd23+0]; sub.f32 %f9, %f7, %f8; .loc 16 50 0 abs.f32 %f10, %f9; max.f32 %f2, %f2, %f10; .loc 16 51 0 ld.volatile.shared.f32 %f11, [%rd22+1088]; ld.volatile.shared.f32 %f12, [%rd23+0]; sub.f32 %f13, %f11, %f12; .loc 16 52 0 abs.f32 %f14, %f13; max.f32 %f1, %f1, %f14; add.s32 %r36, %r36, 1; add.u64 %rd23, %rd23, 4; add.u64 %rd22, %rd22, 4; mov.u32 %r37, 16; setp.ne.s32 %p10, %r36, %r37; @%p10 bra $Lt_0_12034; .loc 16 87 0 bar.sync 0; add.u32 %r19, %r19, 16; setp.lt.s32 %p11, %r19, %r20; @%p11 bra $Lt_0_10242; bra.uni $Lt_0_9730; $Lt_0_13314: add.u32 %r21, %r2, %r4; setp.gt.u32 %p2, %r3, %r21; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 $Lt_0_9730: .loc 16 91 0 @!%p2 bra $Lt_0_12802; .loc 16 95 0 mov.u32 %r38, %ctaid.x; mul.lo.u32 %r39, %r38, 32; mov.u32 %r40, %tid.y; add.u32 %r41, %r40, %r39; ld.param.u64 %rd24, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__DISTSD]; mul.lo.u32 %r42, %r3, %r41; add.u32 %r43, %r21, %r42; cvt.u64.u32 %rd25, %r43; mul.wide.u32 %rd26, %r43, 4; add.u64 %rd27, %rd24, %rd26; st.global.f32 [%rd27+0], %f2; .loc 16 97 0 add.u32 %r44, %r41, 16; mul.lo.u32 %r45, %r3, %r44; add.u32 %r46, %r21, %r45; cvt.u64.u32 %rd28, %r46; mul.wide.u32 %rd29, %r46, 4; add.u64 %rd30, %rd24, %rd29; st.global.f32 [%rd30+0], %f1; $Lt_0_12802: .loc 16 99 0 exit; $LDWend__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0_: } // _Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0_ .entry _Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0_ ( .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__F1D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__F2D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__DISTSD) { .reg .u32 %r<48>; .reg .u64 %rd<32>; .reg .f32 %f<16>; .reg .pred %p<13>; .shared .align 4 .b8 __cuda___cuda_local_var_45591_41_non_const_F2Ds4400[2176]; .shared .align 4 .b8 __cuda___cuda_local_var_45590_41_non_const_F1Ds6576[2176]; .loc 16 16 0 $LDWbegin__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0_: mov.u32 %r1, %ctaid.y; mul.lo.u32 %r2, %r1, 32; ld.const.s32 %r3, [n2C]; mov.u32 %r4, %tid.x; ld.const.s32 %r5, [descDimC]; mov.u32 %r6, 0; setp.le.s32 %p1, %r5, %r6; @%p1 bra $Lt_1_13570; mov.u64 %rd1, __cuda___cuda_local_var_45591_41_non_const_F2Ds4400; mov.u64 %rd2, __cuda___cuda_local_var_45590_41_non_const_F1Ds6576; mov.u32 %r7, %ctaid.x; mul.lo.u32 %r8, %r7, 32; ld.const.s32 %r9, [blockItC]; mul.lo.s32 %r10, %r9, 1024; add.s32 %r11, %r10, %r8; add.s32 %r12, %r5, 15; shr.s32 %r13, %r12, 31; mov.s32 %r14, 15; and.b32 %r15, %r13, %r14; add.s32 %r16, %r15, %r12; shr.s32 %r17, %r16, 4; mov.u32 %r18, %tid.y; mov.s32 %r19, %r18; add.u32 %r20, %r5, %r18; cvt.u64.u32 %rd3, %r18; add.u32 %r21, %r2, %r4; cvt.u64.u32 %rd4, %r4; mul.wide.u32 %rd5, %r4, 17; setp.gt.u32 %p2, %r3, %r21; mov.pred %p3, %p2; mov.pred %p4, %p5; add.u32 %r22, %r11, %r4; selp.s32 %r23, 1, 0, %p3; ld.const.s32 %r24, [n1C]; setp.gt.u32 %p6, %r24, %r22; add.u64 %rd6, %rd3, %rd5; mul.lo.u64 %rd7, %rd6, 4; selp.s32 %r25, 1, 0, %p6; mul.wide.u32 %rd8, %r18, 68; add.u64 %rd9, %rd2, %rd8; add.u64 %rd10, %rd7, %rd2; add.u64 %rd11, %rd7, %rd1; mul.lo.u64 %rd12, %rd5, 4; add.u64 %rd13, %rd1, %rd12; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 mov.s32 %r26, %r17; $Lt_1_10498: // Loop body line 16, nesting depth: 1, estimated iterations: unknown setp.gt.u32 %p7, %r5, %r19; selp.s32 %r27, 1, 0, %p7; and.b32 %r28, %r27, %r25; mov.u32 %r29, 0; setp.eq.s32 %p8, %r28, %r29; @%p8 bra $Lt_1_11010; .loc 16 26 0 ld.param.u64 %rd14, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__F1D]; mul.lo.u32 %r30, %r24, %r19; add.u32 %r31, %r22, %r30; cvt.u64.u32 %rd15, %r31; mul.wide.u32 %rd16, %r31, 4; add.u64 %rd17, %rd14, %rd16; ld.global.f32 %f3, [%rd17+0]; st.volatile.shared.f32 [%rd10+0], %f3; bra.uni $Lt_1_10754; $Lt_1_11010: .loc 16 28 0 mov.f32 %f4, 0f00000000; // 0 st.volatile.shared.f32 [%rd10+0], %f4; $Lt_1_10754: .loc 16 30 0 bar.sync 0; and.b32 %r32, %r27, %r23; mov.u32 %r33, 0; setp.eq.s32 %p9, %r32, %r33; @%p9 bra $Lt_1_11522; .loc 16 33 0 ld.param.u64 %rd18, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__F2D]; mul.lo.u32 %r34, %r3, %r19; add.u32 %r35, %r21, %r34; cvt.u64.u32 %rd19, %r35; mul.wide.u32 %rd20, %r35, 4; add.u64 %rd21, %rd18, %rd20; ld.global.f32 %f5, [%rd21+0]; st.volatile.shared.f32 [%rd11+0], %f5; bra.uni $Lt_1_11266; $Lt_1_11522: .loc 16 35 0 mov.f32 %f6, 0f00000000; // 0 st.volatile.shared.f32 [%rd11+0], %f6; $Lt_1_11266: .loc 16 37 0 bar.sync 0; .loc 16 24 0 mov.s64 %rd22, %rd9; mov.s64 %rd23, %rd13; mov.s32 %r36, 0; $Lt_1_12290: .pragma "nounroll"; // Loop body line 24, nesting depth: 2, iterations: 16 .loc 16 55 0 ld.volatile.shared.f32 %f7, [%rd22+0]; ld.volatile.shared.f32 %f8, [%rd23+0]; sub.f32 %f9, %f7, %f8; .loc 16 56 0 abs.f32 %f10, %f9; add.f32 %f2, %f2, %f10; .loc 16 57 0 ld.volatile.shared.f32 %f11, [%rd22+1088]; ld.volatile.shared.f32 %f12, [%rd23+0]; sub.f32 %f13, %f11, %f12; .loc 16 58 0 abs.f32 %f14, %f13; add.f32 %f1, %f1, %f14; add.s32 %r36, %r36, 1; add.u64 %rd23, %rd23, 4; add.u64 %rd22, %rd22, 4; mov.u32 %r37, 16; setp.ne.s32 %p10, %r36, %r37; @%p10 bra $Lt_1_12290; .loc 16 87 0 bar.sync 0; add.u32 %r19, %r19, 16; setp.lt.s32 %p11, %r19, %r20; @%p11 bra $Lt_1_10498; bra.uni $Lt_1_9986; $Lt_1_13570: add.u32 %r21, %r2, %r4; setp.gt.u32 %p2, %r3, %r21; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 $Lt_1_9986: .loc 16 91 0 @!%p2 bra $Lt_1_13058; .loc 16 95 0 mov.u32 %r38, %ctaid.x; mul.lo.u32 %r39, %r38, 32; mov.u32 %r40, %tid.y; add.u32 %r41, %r40, %r39; ld.param.u64 %rd24, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__DISTSD]; mul.lo.u32 %r42, %r3, %r41; add.u32 %r43, %r21, %r42; cvt.u64.u32 %rd25, %r43; mul.wide.u32 %rd26, %r43, 4; add.u64 %rd27, %rd24, %rd26; st.global.f32 [%rd27+0], %f2; .loc 16 97 0 add.u32 %r44, %r41, 16; mul.lo.u32 %r45, %r3, %r44; add.u32 %r46, %r21, %r45; cvt.u64.u32 %rd28, %r46; mul.wide.u32 %rd29, %r46, 4; add.u64 %rd30, %rd24, %rd29; st.global.f32 [%rd30+0], %f1; $Lt_1_13058: .loc 16 99 0 exit; $LDWend__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0_: } // _Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0_ .entry _Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0_ ( .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__F1D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__F2D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__DISTSD) { .reg .u32 %r<48>; .reg .u64 %rd<32>; .reg .f32 %f<14>; .reg .pred %p<13>; .shared .align 4 .b8 __cuda___cuda_local_var_45591_41_non_const_F2Ds8776[2176]; .shared .align 4 .b8 __cuda___cuda_local_var_45590_41_non_const_F1Ds10952[2176]; .loc 16 16 0 $LDWbegin__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0_: mov.u32 %r1, %ctaid.y; mul.lo.u32 %r2, %r1, 32; ld.const.s32 %r3, [n2C]; mov.u32 %r4, %tid.x; ld.const.s32 %r5, [descDimC]; mov.u32 %r6, 0; setp.le.s32 %p1, %r5, %r6; @%p1 bra $Lt_2_13826; mov.u64 %rd1, __cuda___cuda_local_var_45591_41_non_const_F2Ds8776; mov.u64 %rd2, __cuda___cuda_local_var_45590_41_non_const_F1Ds10952; mov.u32 %r7, %ctaid.x; mul.lo.u32 %r8, %r7, 32; ld.const.s32 %r9, [blockItC]; mul.lo.s32 %r10, %r9, 1024; add.s32 %r11, %r10, %r8; add.s32 %r12, %r5, 15; shr.s32 %r13, %r12, 31; mov.s32 %r14, 15; and.b32 %r15, %r13, %r14; add.s32 %r16, %r15, %r12; shr.s32 %r17, %r16, 4; mov.u32 %r18, %tid.y; mov.s32 %r19, %r18; add.u32 %r20, %r5, %r18; cvt.u64.u32 %rd3, %r18; add.u32 %r21, %r2, %r4; cvt.u64.u32 %rd4, %r4; mul.wide.u32 %rd5, %r4, 17; setp.gt.u32 %p2, %r3, %r21; mov.pred %p3, %p2; mov.pred %p4, %p5; add.u32 %r22, %r11, %r4; selp.s32 %r23, 1, 0, %p3; ld.const.s32 %r24, [n1C]; setp.gt.u32 %p6, %r24, %r22; add.u64 %rd6, %rd3, %rd5; mul.lo.u64 %rd7, %rd6, 4; selp.s32 %r25, 1, 0, %p6; mul.wide.u32 %rd8, %r18, 68; add.u64 %rd9, %rd2, %rd8; add.u64 %rd10, %rd7, %rd2; add.u64 %rd11, %rd7, %rd1; mul.lo.u64 %rd12, %rd5, 4; add.u64 %rd13, %rd1, %rd12; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 mov.s32 %r26, %r17; $Lt_2_10754: // Loop body line 16, nesting depth: 1, estimated iterations: unknown setp.gt.u32 %p7, %r5, %r19; selp.s32 %r27, 1, 0, %p7; and.b32 %r28, %r27, %r25; mov.u32 %r29, 0; setp.eq.s32 %p8, %r28, %r29; @%p8 bra $Lt_2_11266; .loc 16 26 0 ld.param.u64 %rd14, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__F1D]; mul.lo.u32 %r30, %r24, %r19; add.u32 %r31, %r22, %r30; cvt.u64.u32 %rd15, %r31; mul.wide.u32 %rd16, %r31, 4; add.u64 %rd17, %rd14, %rd16; ld.global.f32 %f3, [%rd17+0]; st.volatile.shared.f32 [%rd10+0], %f3; bra.uni $Lt_2_11010; $Lt_2_11266: .loc 16 28 0 mov.f32 %f4, 0f00000000; // 0 st.volatile.shared.f32 [%rd10+0], %f4; $Lt_2_11010: .loc 16 30 0 bar.sync 0; and.b32 %r32, %r27, %r23; mov.u32 %r33, 0; setp.eq.s32 %p9, %r32, %r33; @%p9 bra $Lt_2_11778; .loc 16 33 0 ld.param.u64 %rd18, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__F2D]; mul.lo.u32 %r34, %r3, %r19; add.u32 %r35, %r21, %r34; cvt.u64.u32 %rd19, %r35; mul.wide.u32 %rd20, %r35, 4; add.u64 %rd21, %rd18, %rd20; ld.global.f32 %f5, [%rd21+0]; st.volatile.shared.f32 [%rd11+0], %f5; bra.uni $Lt_2_11522; $Lt_2_11778: .loc 16 35 0 mov.f32 %f6, 0f00000000; // 0 st.volatile.shared.f32 [%rd11+0], %f6; $Lt_2_11522: .loc 16 37 0 bar.sync 0; .loc 16 24 0 mov.s64 %rd22, %rd9; mov.s64 %rd23, %rd13; mov.s32 %r36, 0; $Lt_2_12546: .pragma "nounroll"; // Loop body line 24, nesting depth: 2, iterations: 16 .loc 16 61 0 ld.volatile.shared.f32 %f7, [%rd22+0]; ld.volatile.shared.f32 %f8, [%rd23+0]; sub.f32 %f9, %f7, %f8; .loc 16 62 0 fma.rn.f32 %f2, %f9, %f9, %f2; .loc 16 63 0 ld.volatile.shared.f32 %f10, [%rd22+1088]; ld.volatile.shared.f32 %f11, [%rd23+0]; sub.f32 %f12, %f10, %f11; .loc 16 64 0 fma.rn.f32 %f1, %f12, %f12, %f1; add.s32 %r36, %r36, 1; add.u64 %rd23, %rd23, 4; add.u64 %rd22, %rd22, 4; mov.u32 %r37, 16; setp.ne.s32 %p10, %r36, %r37; @%p10 bra $Lt_2_12546; .loc 16 87 0 bar.sync 0; add.u32 %r19, %r19, 16; setp.lt.s32 %p11, %r19, %r20; @%p11 bra $Lt_2_10754; bra.uni $Lt_2_10242; $Lt_2_13826: add.u32 %r21, %r2, %r4; setp.gt.u32 %p2, %r3, %r21; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 $Lt_2_10242: .loc 16 91 0 @!%p2 bra $Lt_2_13314; .loc 16 95 0 mov.u32 %r38, %ctaid.x; mul.lo.u32 %r39, %r38, 32; mov.u32 %r40, %tid.y; add.u32 %r41, %r40, %r39; ld.param.u64 %rd24, [__cudaparm__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__DISTSD]; mul.lo.u32 %r42, %r3, %r41; add.u32 %r43, %r21, %r42; cvt.u64.u32 %rd25, %r43; mul.wide.u32 %rd26, %r43, 4; add.u64 %rd27, %rd24, %rd26; st.global.f32 [%rd27+0], %f2; .loc 16 97 0 add.u32 %r44, %r41, 16; mul.lo.u32 %r45, %r3, %r44; add.u32 %r46, %r21, %r45; cvt.u64.u32 %rd28, %r46; mul.wide.u32 %rd29, %r46, 4; add.u64 %rd30, %rd24, %rd29; st.global.f32 [%rd30+0], %f1; $Lt_2_13314: .loc 16 99 0 exit; $LDWend__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0_: } // _Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0_ .entry _Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0_ ( .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__F1D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__F2D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__DISTSD) { .reg .u32 %r<48>; .reg .u64 %rd<32>; .reg .f32 %f<24>; .reg .pred %p<15>; .shared .align 4 .b8 __cuda___cuda_local_var_45591_41_non_const_F2Ds13152[2176]; .shared .align 4 .b8 __cuda___cuda_local_var_45590_41_non_const_F1Ds15328[2176]; .loc 16 16 0 $LDWbegin__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0_: mov.u32 %r1, %ctaid.y; mul.lo.u32 %r2, %r1, 32; ld.const.s32 %r3, [n2C]; mov.u32 %r4, %tid.x; ld.const.s32 %r5, [descDimC]; mov.u32 %r6, 0; setp.le.s32 %p1, %r5, %r6; @%p1 bra $Lt_3_14338; mov.u64 %rd1, __cuda___cuda_local_var_45591_41_non_const_F2Ds13152; mov.u64 %rd2, __cuda___cuda_local_var_45590_41_non_const_F1Ds15328; mov.u32 %r7, %ctaid.x; mul.lo.u32 %r8, %r7, 32; ld.const.s32 %r9, [blockItC]; mul.lo.s32 %r10, %r9, 1024; add.s32 %r11, %r10, %r8; add.s32 %r12, %r5, 15; shr.s32 %r13, %r12, 31; mov.s32 %r14, 15; and.b32 %r15, %r13, %r14; add.s32 %r16, %r15, %r12; shr.s32 %r17, %r16, 4; mov.u32 %r18, %tid.y; mov.s32 %r19, %r18; add.u32 %r20, %r5, %r18; cvt.u64.u32 %rd3, %r18; add.u32 %r21, %r2, %r4; cvt.u64.u32 %rd4, %r4; mul.wide.u32 %rd5, %r4, 17; setp.gt.u32 %p2, %r3, %r21; mov.pred %p3, %p2; mov.pred %p4, %p5; add.u32 %r22, %r11, %r4; selp.s32 %r23, 1, 0, %p3; ld.const.s32 %r24, [n1C]; setp.gt.u32 %p6, %r24, %r22; add.u64 %rd6, %rd3, %rd5; mul.lo.u64 %rd7, %rd6, 4; selp.s32 %r25, 1, 0, %p6; mul.wide.u32 %rd8, %r18, 68; add.u64 %rd9, %rd2, %rd8; add.u64 %rd10, %rd7, %rd2; add.u64 %rd11, %rd7, %rd1; mul.lo.u64 %rd12, %rd5, 4; add.u64 %rd13, %rd1, %rd12; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 mov.s32 %r26, %r17; $Lt_3_10242: // Loop body line 16, nesting depth: 1, estimated iterations: unknown setp.gt.u32 %p7, %r5, %r19; selp.s32 %r27, 1, 0, %p7; and.b32 %r28, %r27, %r25; mov.u32 %r29, 0; setp.eq.s32 %p8, %r28, %r29; @%p8 bra $Lt_3_10754; .loc 16 26 0 ld.param.u64 %rd14, [__cudaparm__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__F1D]; mul.lo.u32 %r30, %r24, %r19; add.u32 %r31, %r22, %r30; cvt.u64.u32 %rd15, %r31; mul.wide.u32 %rd16, %r31, 4; add.u64 %rd17, %rd14, %rd16; ld.global.f32 %f3, [%rd17+0]; st.volatile.shared.f32 [%rd10+0], %f3; bra.uni $Lt_3_10498; $Lt_3_10754: .loc 16 28 0 mov.f32 %f4, 0f00000000; // 0 st.volatile.shared.f32 [%rd10+0], %f4; $Lt_3_10498: .loc 16 30 0 bar.sync 0; and.b32 %r32, %r27, %r23; mov.u32 %r33, 0; setp.eq.s32 %p9, %r32, %r33; @%p9 bra $Lt_3_11266; .loc 16 33 0 ld.param.u64 %rd18, [__cudaparm__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__F2D]; mul.lo.u32 %r34, %r3, %r19; add.u32 %r35, %r21, %r34; cvt.u64.u32 %rd19, %r35; mul.wide.u32 %rd20, %r35, 4; add.u64 %rd21, %rd18, %rd20; ld.global.f32 %f5, [%rd21+0]; st.volatile.shared.f32 [%rd11+0], %f5; bra.uni $Lt_3_11010; $Lt_3_11266: .loc 16 35 0 mov.f32 %f6, 0f00000000; // 0 st.volatile.shared.f32 [%rd11+0], %f6; $Lt_3_11010: .loc 16 37 0 bar.sync 0; .loc 16 24 0 mov.s64 %rd22, %rd9; mov.s64 %rd23, %rd13; mov.s32 %r36, 0; $Lt_3_12034: .pragma "nounroll"; // Loop body line 24, nesting depth: 2, iterations: 16 .loc 16 69 0 ld.volatile.shared.f32 %f7, [%rd22+0]; ld.volatile.shared.f32 %f8, [%rd23+0]; sub.f32 %f9, %f7, %f8; .loc 16 70 0 ld.volatile.shared.f32 %f10, [%rd22+0]; ld.volatile.shared.f32 %f11, [%rd23+0]; add.f32 %f12, %f10, %f11; mov.f32 %f13, 0f00000000; // 0 setp.gt.f32 %p10, %f12, %f13; @!%p10 bra $Lt_3_12290; .loc 16 72 0 mul.f32 %f14, %f9, %f9; div.rn.f32 %f15, %f14, %f12; add.f32 %f2, %f2, %f15; $Lt_3_12290: .loc 16 73 0 ld.volatile.shared.f32 %f16, [%rd22+1088]; ld.volatile.shared.f32 %f17, [%rd23+0]; sub.f32 %f9, %f16, %f17; .loc 16 74 0 ld.volatile.shared.f32 %f18, [%rd23+0]; ld.volatile.shared.f32 %f19, [%rd22+1088]; add.f32 %f12, %f18, %f19; mov.f32 %f20, 0f00000000; // 0 setp.gt.f32 %p11, %f12, %f20; @!%p11 bra $Lt_3_12802; .loc 16 76 0 mul.f32 %f21, %f9, %f9; div.rn.f32 %f22, %f21, %f12; add.f32 %f1, %f1, %f22; $Lt_3_12802: add.s32 %r36, %r36, 1; add.u64 %rd23, %rd23, 4; add.u64 %rd22, %rd22, 4; mov.u32 %r37, 16; setp.ne.s32 %p12, %r36, %r37; @%p12 bra $Lt_3_12034; .loc 16 87 0 bar.sync 0; add.u32 %r19, %r19, 16; setp.lt.s32 %p13, %r19, %r20; @%p13 bra $Lt_3_10242; bra.uni $Lt_3_9730; $Lt_3_14338: add.u32 %r21, %r2, %r4; setp.gt.u32 %p2, %r3, %r21; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 $Lt_3_9730: .loc 16 91 0 @!%p2 bra $Lt_3_13826; .loc 16 95 0 mov.u32 %r38, %ctaid.x; mul.lo.u32 %r39, %r38, 32; mov.u32 %r40, %tid.y; add.u32 %r41, %r40, %r39; ld.param.u64 %rd24, [__cudaparm__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__DISTSD]; mul.lo.u32 %r42, %r3, %r41; add.u32 %r43, %r21, %r42; cvt.u64.u32 %rd25, %r43; mul.wide.u32 %rd26, %r43, 4; add.u64 %rd27, %rd24, %rd26; st.global.f32 [%rd27+0], %f2; .loc 16 97 0 add.u32 %r44, %r41, 16; mul.lo.u32 %r45, %r3, %r44; add.u32 %r46, %r21, %r45; cvt.u64.u32 %rd28, %r46; mul.wide.u32 %rd29, %r46, 4; add.u64 %rd30, %rd24, %rd29; st.global.f32 [%rd30+0], %f1; $Lt_3_13826: .loc 16 99 0 exit; $LDWend__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0_: } // _Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0_ .entry _Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0_ ( .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__F1D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__F2D, .param .u64 __cudaparm__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__DISTSD) { .reg .u32 %r<48>; .reg .u64 %rd<32>; .reg .f32 %f<13>; .reg .pred %p<13>; .shared .align 4 .b8 __cuda___cuda_local_var_45591_41_non_const_F2Ds17528[2176]; .shared .align 4 .b8 __cuda___cuda_local_var_45590_41_non_const_F1Ds19704[2176]; .loc 16 16 0 $LDWbegin__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0_: mov.u32 %r1, %ctaid.y; mul.lo.u32 %r2, %r1, 32; ld.const.s32 %r3, [n2C]; mov.u32 %r4, %tid.x; ld.const.s32 %r5, [descDimC]; mov.u32 %r6, 0; setp.le.s32 %p1, %r5, %r6; @%p1 bra $Lt_4_13314; mov.u64 %rd1, __cuda___cuda_local_var_45591_41_non_const_F2Ds17528; mov.u64 %rd2, __cuda___cuda_local_var_45590_41_non_const_F1Ds19704; mov.u32 %r7, %ctaid.x; mul.lo.u32 %r8, %r7, 32; ld.const.s32 %r9, [blockItC]; mul.lo.s32 %r10, %r9, 1024; add.s32 %r11, %r10, %r8; add.s32 %r12, %r5, 15; shr.s32 %r13, %r12, 31; mov.s32 %r14, 15; and.b32 %r15, %r13, %r14; add.s32 %r16, %r15, %r12; shr.s32 %r17, %r16, 4; mov.u32 %r18, %tid.y; mov.s32 %r19, %r18; add.u32 %r20, %r5, %r18; cvt.u64.u32 %rd3, %r18; add.u32 %r21, %r2, %r4; cvt.u64.u32 %rd4, %r4; mul.wide.u32 %rd5, %r4, 17; setp.gt.u32 %p2, %r3, %r21; mov.pred %p3, %p2; mov.pred %p4, %p5; add.u32 %r22, %r11, %r4; selp.s32 %r23, 1, 0, %p3; ld.const.s32 %r24, [n1C]; setp.gt.u32 %p6, %r24, %r22; add.u64 %rd6, %rd3, %rd5; mul.lo.u64 %rd7, %rd6, 4; selp.s32 %r25, 1, 0, %p6; mul.wide.u32 %rd8, %r18, 68; add.u64 %rd9, %rd2, %rd8; add.u64 %rd10, %rd7, %rd2; add.u64 %rd11, %rd7, %rd1; mul.lo.u64 %rd12, %rd5, 4; add.u64 %rd13, %rd1, %rd12; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 mov.s32 %r26, %r17; $Lt_4_10242: // Loop body line 16, nesting depth: 1, estimated iterations: unknown setp.gt.u32 %p7, %r5, %r19; selp.s32 %r27, 1, 0, %p7; and.b32 %r28, %r27, %r25; mov.u32 %r29, 0; setp.eq.s32 %p8, %r28, %r29; @%p8 bra $Lt_4_10754; .loc 16 26 0 ld.param.u64 %rd14, [__cudaparm__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__F1D]; mul.lo.u32 %r30, %r24, %r19; add.u32 %r31, %r22, %r30; cvt.u64.u32 %rd15, %r31; mul.wide.u32 %rd16, %r31, 4; add.u64 %rd17, %rd14, %rd16; ld.global.f32 %f3, [%rd17+0]; st.volatile.shared.f32 [%rd10+0], %f3; bra.uni $Lt_4_10498; $Lt_4_10754: .loc 16 28 0 mov.f32 %f4, 0f00000000; // 0 st.volatile.shared.f32 [%rd10+0], %f4; $Lt_4_10498: .loc 16 30 0 bar.sync 0; and.b32 %r32, %r27, %r23; mov.u32 %r33, 0; setp.eq.s32 %p9, %r32, %r33; @%p9 bra $Lt_4_11266; .loc 16 33 0 ld.param.u64 %rd18, [__cudaparm__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__F2D]; mul.lo.u32 %r34, %r3, %r19; add.u32 %r35, %r21, %r34; cvt.u64.u32 %rd19, %r35; mul.wide.u32 %rd20, %r35, 4; add.u64 %rd21, %rd18, %rd20; ld.global.f32 %f5, [%rd21+0]; st.volatile.shared.f32 [%rd11+0], %f5; bra.uni $Lt_4_11010; $Lt_4_11266: .loc 16 35 0 mov.f32 %f6, 0f00000000; // 0 st.volatile.shared.f32 [%rd11+0], %f6; $Lt_4_11010: .loc 16 37 0 bar.sync 0; .loc 16 24 0 mov.s64 %rd22, %rd9; mov.s64 %rd23, %rd13; mov.s32 %r36, 0; $Lt_4_12034: .pragma "nounroll"; // Loop body line 24, nesting depth: 2, iterations: 16 .loc 16 79 0 ld.volatile.shared.f32 %f7, [%rd22+0]; .loc 16 80 0 ld.volatile.shared.f32 %f8, [%rd23+0]; .loc 16 81 0 min.f32 %f9, %f7, %f8; add.f32 %f2, %f2, %f9; .loc 16 82 0 ld.volatile.shared.f32 %f10, [%rd22+1088]; .loc 16 83 0 min.f32 %f11, %f10, %f8; add.f32 %f1, %f1, %f11; add.s32 %r36, %r36, 1; add.u64 %rd23, %rd23, 4; add.u64 %rd22, %rd22, 4; mov.u32 %r37, 16; setp.ne.s32 %p10, %r36, %r37; @%p10 bra $Lt_4_12034; .loc 16 87 0 bar.sync 0; add.u32 %r19, %r19, 16; setp.lt.s32 %p11, %r19, %r20; @%p11 bra $Lt_4_10242; bra.uni $Lt_4_9730; $Lt_4_13314: add.u32 %r21, %r2, %r4; setp.gt.u32 %p2, %r3, %r21; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 $Lt_4_9730: .loc 16 91 0 @!%p2 bra $Lt_4_12802; .loc 16 95 0 mov.u32 %r38, %ctaid.x; mul.lo.u32 %r39, %r38, 32; mov.u32 %r40, %tid.y; add.u32 %r41, %r40, %r39; ld.param.u64 %rd24, [__cudaparm__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__DISTSD]; mul.lo.u32 %r42, %r3, %r41; add.u32 %r43, %r21, %r42; cvt.u64.u32 %rd25, %r43; mul.wide.u32 %rd26, %r43, 4; add.u64 %rd27, %rd24, %rd26; st.global.f32 [%rd27+0], %f2; .loc 16 97 0 add.u32 %r44, %r41, 16; mul.lo.u32 %r45, %r3, %r44; add.u32 %r46, %r21, %r45; cvt.u64.u32 %rd28, %r46; mul.wide.u32 %rd29, %r46, 4; add.u64 %rd30, %rd24, %rd29; st.global.f32 [%rd30+0], %f1; $Lt_4_12802: .loc 16 99 0 exit; $LDWend__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0_: } // _Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0_ _Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0_dense_distance_gpu_large32x16.cu%s(%i) : cudaSafeCall() Runtime API error %d: %s. Use [Dists] = dense_distance_gpu_large(F1,F2,mode,param); (float) F: num x dim. Dists will be num2 x num1. mode can be 0: L_p distance(p determined by param), 1: Chi2 distance, 2: Histogram Intersection. param determines parameters for each method (pass empty if no param is required), it can be 0 for L_inf, or 1 or 2 for L_1 or L_2.Invalid dimensions. F1,F2 should have the same dimension.n1Cn2CblockItCdescDimCInvalid mode.p currently can be 0,1 or 2=:;X T|$W 4W$^\_``aabzPRxV  $0TiK0<^uD0TP_uD0l_uD0 `uD0`uD0V 4VBJBF CB(GA0CA8G}$\|NMN`LzRx `` 8 ho0 q  X P ooo ooo  x f v   & 6 F V f v CbFGCC: (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5.symtab.strtab.shstrtab.note.gnu.build-id.gnu.hash.dynsym.dynstr.gnu.version.gnu.version_d.gnu.version_r.rela.dyn.rela.plt.init.text.fini.rodata.eh_frame_hdr.eh_frame.ctors.dtors.jcr.dynamic.got.got.plt.data.nvFatBinSegment.bss.comment$.o(8 @00qHoBUo8do  psP} X 8 8 P P  xhh  \hh\H H` `p px x  P    P0+ J >0   8  P   h hH ` p x       H *` 8p E [ j x X p  0 i~  0       G ( 0 48   h   u h  uI U @z u@   u ux  8 %,5Oj|   !3M`k" |'3call_gmon_startcrtstuff.c__CTOR_LIST____DTOR_LIST____JCR_LIST____do_global_dtors_auxcompleted.7424dtor_idx.7426frame_dummy__CTOR_END____FRAME_END____JCR_END____do_global_ctors_auxtmpxft_00006296_00000000-1_dense_distance_gpu_large32x16.cudafe1.cppfatbinData_ZL101__sti____cudaRegisterAll_64_tmpxft_00006296_00000000_4_dense_distance_gpu_large32x16_cpp1_ii_1f785c06v_ZL15__fatDeviceText_ZL26__cudaUnregisterBinaryUtilv_ZL20__cudaFatCubinHandle_ZL3n1C_ZL3n2C_ZL8blockItC_ZL8descDimC_ZZL61__device_stub__Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0_PfS_S_E3__f_ZZL61__device_stub__Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0_PfS_S_E3__f_ZZL61__device_stub__Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0_PfS_S_E3__f_ZZL61__device_stub__Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0_PfS_S_E3__f_ZZL61__device_stub__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0_PfS_S_E3__fmexversion.cDW.ref.__gxx_personality_v0_fini_GLOBAL_OFFSET_TABLE_atexit_Z24ComputeDistancesFix32_16ILi0ELi2EEvPfS0_S0___dso_handle__DTOR_END___Z24ComputeDistancesFix32_16ILi1ELi0EEvPfS0_S0___bss_start_Z21DenseDistanceGpuLargePfiS_iiS_ii_Z24ComputeDistancesFix32_16ILi0ELi0EEvPfS0_S0__end_edata_Z24ComputeDistancesFix32_16ILi0ELi1EEvPfS0_S0__Z24ComputeDistancesFix32_16ILi2ELi0EEvPfS0_S0__DYNAMIC_initmxGetNcudaFreemxCreateNumericMatrix_700__fprintf_chk@@GLIBC_2.3.4__cudaRegisterVar__gmon_start___Jv_RegisterClassesexit@@GLIBC_2.2.5__cudaRegisterFunctionMEXcudaMemcpyToSymbol__cxa_atexit@@GLIBC_2.2.5cudaSetupArgumentcudaThreadSynchronizecudaConfigureCall__cudaUnregisterFatBinarycudaGetErrorStringcudaLaunch__cxa_finalize@@GLIBC_2.2.5__cudaRegisterFatBinaryceilf@@GLIBC_2.2.5mxGetMmexFunctionstderr@@GLIBC_2.2.5mxGetDatamexErrMsgTxtcudaMemcpycudaMalloc__gxx_personality_v0@@CXXABI_1.3mxGetScalarmxIsSingle