ELF>@س@8@ hh h 8   $$PtdQtdRtdhh h GNU@;62'PQR8@ 8 5 "*"1^b  "VK""{&tD""8"l3"#WK""pe""" 8 __gmon_start____cxa_finalize_Jv_RegisterClasses__cudaRegisterFatBinary__cudaRegisterFunction__cudaRegisterVar__cudaUnregisterFatBinarycudaSetupArgumentcudaLaunchsqrtlog1pexp__isnanlog__isinffloorsinremquocoslog2exp2ceilceilfcudaThreadSynchronizecudaMemcpyToSymbolAsynccudaConfigureCallcublasSscalcudaGetErrorStringstderr__fprintf_chkcudaMemGetInfomexPrintfcudaMalloccudaFreecudaMemcpyAsync_ZnamcudaMemcpyToSymbolcudaMemcpy_ZdaPvdifftimemexFunctionmexEvalStringmexErrMsgTxtcudaGetSymbolAddressmxGetDatamxGetMmxGetNmxCreateNumericMatrix_700cublasInitcublasShutdownmxIsSingle__gxx_personality_v0libcudart.so.3libcublas.so.3libmx.solibmex.solibmat.solibm.so.6libstdc++.so.6libpthread.so.0libc.so.6__cxa_atexitMDKDE_Isotropic.mexa64MEXCXXABI_1.3GLIBCXX_3.4GLIBC_2.2.5GLIBC_2.3.4D}Q0ӯkt)&0ui 2ti >ui 2p @Я  د   ȱ `e  :  8 @ H P  X  `     ȯ  ) 1     ( 0 8  @  H  P X ` h p x          Ȱ а ذ   ! " # $ % & ' (( *0 +8 ,@ -H .P /X 0` 2h 3p 4x 5 6 7H?'H5* %, @%* h%" h% h% h% h% h% h% hp% h`% h P%ڜ h @%Ҝ h 0%ʜ h %œ h % h% h% h% h% h% h% h% h%z h%r hp%j h`%b hP%Z h@%R h0%J h %B h%: h%2 h%* h %" h!% h"% h#% h$% h%% h&% h'p% h(`% h)P%ڛ h*@%қ h+0%ʛ h, %› h-% h.% h/% h0% h1HH HtHÐU= HATSubH= t H=o *HK L%< H] L)HHH9s HH= AH2 H9r [A\UH= HtH HtH=ߖ Ð9N9FH9HHNH9HHF9M9CH9HHMH9HHCH=ٚ H(`H=9Hb "H VH5_ H=H E1AHD$HHD$HD$H$H ݆H= H5 AE1D$H$OH H=њ H5֚ AE1D$H$H |H= H5 AE1D$H$H KH=g H5 AE1D$H$H( H=) H(H|$H|$Ht$HT$1YXXYX-*YXրYX-YXƀYX- YXYX-YXYX-YXYX-ڀYXYX-ʀYXvYX-YXfYX-YXYH(^Ðf(H(Xf(w=fWQf.zt :f(D DD^oEXAYEYX`DXAYEYXMDXAYEYX:DXqAYEYX'DX^AYEYXDXKAYEYXDX8AYEYXDX%AYEYX#DXH(EYA^P~ $@$f(o~\Y fX~H(\ HZfHfZSf(H $m$utf.~ztXf($j$uAf($3$f.uHzF 6H f([^^^f(7H [Ðf(H [XfH|$ f(\$YutfW몐HZfHfZf(H( $^ $u5f. }ztf( $[ $t"|^H(Ðf(H(X%h $fTf(d$ YC%Kt$f(-fWYYY$f„YXB|YXnfWe HZfHfZA*AWAVAAUATIUSHXDD$,LL$ZYm~$Ht$ D$@ D$DD$H,}H*Љ $Y-~ffZ^(Q,l$0D$8D$4Xk޿ Ll$LH=zE11ALD{D$LAHT$@L$HAH|$0t$8E1E1McAI CM Et@޿@mM9 މIH=zE11ALD$LHT$@L$HE1H|$0t$8E1i MމH=yE11ALD$LHT$@L$HE1H|$0t$8E1JHT$H|$ LM98*Ht$||$,^*HX[]A\A]A^A_HT$H|$ L@HT$H|$ LH; H xH5uIAj1H;U AWAVAUAATULSHhH|$H|$XHt$T$DD$Ht$0H|$(HD$HHD$@HD$8\$Dd$\ECH4ׂCDAAAA߉\$EEC'AHD$(AHHD$0D)HHH1HHH=tH,LD$(HcL9wWMcH|$HLH=(u1H|$HH|$@H|$8Hh[]A\A]A^A_H=ytDD1Hh[]A\A]A^A_H=s1Hh[]A\A]A^A_Hc\$H|$@H_McH|$8LGHt$H|$HE1LHt$H|$@E1HڹH$H|$8E1LCT$HÅ  LuL9 DEQWۍ~f( y9D((f(yDEf(fD(DEAAZf(ZfD(fYfDYfYfDYf^fE^fDZfEZED)CtgD(fD( D}fD(D}AEZfE(DZfE(fDYfDYfEYfEYfE^fE^fAZfAZ){D(fD(D4(f(Dt(AEZfE(DZfE(fDYfDYfEYfEYfE^fA^fAZfZ)4HD(fD(D,(f(Dl(AEZfE(DZfA(fDYfYfEYfAYfE^f^fEZfZDD)<H9w(fD(4(fD((t(DZfE(DZfE(fDYfDYfEYfEYfE^fD(fE^fD(fAZfAZ()$llZfD(DZfE(fDYfDYfDYfEYfE^fD(fE^fD(fAZfEZA)d(| |(Zf(DZfE(fYfDYfYfEYfD^f(fE^fD(fEZfEZED)D d0d8Zf(DZfE(fYfDYfYfEYf^fE^fDZfEZED)L0H@9D9Mc$vIDApLf(}N HMZf(9IAXY^f(ffZA TtBD)fD(ApHMEZfE(EXEYE^fE(fEfEZDIAf(HZf(XY^fD(fEfEZD0HDf(HEZfE(EXEYA^f(ffZ(H1fD(HDZfE(EXEYE^fE(fEfEZDH!f(ЃHZf(XY^f(ffZHD)fD(ЃHEZfE(EXEYE^fE(fEfEZDHf(HZf(XY^fD(fEfEZD0H9Df(f(EZfE(fD(EXEYA^f(ffZ(QZf(f(XY^fD(fEfEZDpDiEZfE(fD(EXEYE^fE(fEfEZD@y Zf(f(XY^f(ffZH qDZfE(f(EXEYE^fE(fEfEZDXDQEZfE(fD(EXEYA^f(ffZhQZf(XY^fD(fEfEZDpDiH EZfE(EXEYE^fE(fEfEZD@H 9S$LcAJ,AAHHރD9AG ~EHEt|tgtRt=t'tHEHHHHHH9vE@@@ @@@@H 9wA9A)DD EAWwJl9HE)Etot\tMt:t+tt )HE )H)H)H)H)H)H9s*))@)@ )@0)@@)@P)@`)@pH9rDE9Hc֍JH<փHGtmtVtDt2tt JHGHHHHHHF@@@ @@@@H ~H= 1AHHt$H=kE11AHt$H=kE11ArLL$8DD$DT$Ht$@H|$HHt$8H$LH|$HH|$@wH|$8mHtHPH|$PFH|$PHt$XgfH=hfZZ(Hh[]A\A]A^A_LxI9FumJZf(f(H4XH9Y^f(ffZ \HHH;HHHfttHt<ufD(DZfE(EXEYE^fE(fEfEZD[df(Zf(XY^f(ffZ HDlfD(EZfE(EXEYE^fE(fEfEZDHTf(Zf(XY^fD(fEfEZD4HDTf(EZfE(EXEYA^f(ffZ,HtfD(DZfE(EXEYE^fE(fEfEZDHdf(Zf(XY^f(ffZ HH9DLf(fD(EZfA(fD(AXAY^f(ffZ$TZf(f(XYD^fE(fEfEZDlDdEZfE(fD(EXEYE^fE(fEfAZ|l Zf(f(XY^f(ffZL D|EZfE(fD(EXEYE^fE(fEfEZDTDLEZfA(fD(AXAY^f(ffZdTZf(XYD^fE(fEfEZDlDdEZfE(EXEYE^fE(fEfAZ|H H9GsH x H fHcIA1H;&^ HAHl$Ll$Lt$H\$ILd$L|$HxHLiHD$H=_ffH=dH|$0H5f99H}I}IH}HD$(^H}3I}AGI}HD$ HT$HH:);\$ ItH=eH=%eXD9HL$H9D1ɺIHIIH|$(MELL4$ H=IePH\$HHl$PLd$XLl$`Lt$hL|$pHxH=eH=dXH=dH=dmH9XH}GH}N1Hu Ht H11UHSHH s HtHs HHHuH[ÐHHELF3)@@8@ ) )3\ 6 8L]T\1 ).shstrtab.strtab.symtab.nv.global.init.nv.global.text._Z22MDKDE_Isotropic_K32x16PfS_S_.nv.info._Z22MDKDE_Isotropic_K32x16PfS_S_.nv.info.nv.shared._Z22MDKDE_Isotropic_K32x16PfS_S_.nv.constant16._Z22MDKDE_Isotropic_K32x16PfS_S_.nv.constant0._Z22MDKDE_Isotropic_K32x16PfS_S_.nv.constant2_Z22MDKDE_Isotropic_K32x16PfS_S_N2CBWCDimCN1C  " & * / ]D(\,,#HQP(@H`(Ʌ! ;@,,\CPQ@H P@ P4H#s H @3`rHP @H#Q HHBH(\RHP#%H!HP#7H2HP`)@H)H@ H@(0 C0PC0@H (@ܰ`-@H)H@ H@(0 C0PC0@H (@ܰ!`!@H)H@ $H@(0 C0PC0@H (@`%@H)H@ (H@(0 C0PC0@H (@! ܟ(~ P```\$4B0H(APAX$A&0Ʌ4c]MP\EX$E&0Ʌ\$4BMP@H(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$ 4 BMPPH(EQX4Q 0Ʌ5 c]QP\EX4E$0Ʌ\$040BMP`H(EQX4Q 0Ʌ50c]QP\EX4E$0Ʌ\$@4@BMPpH(EQX4Q 0Ʌ5@c]QP\EX4E$0Ʌ\$P4PBMPH(EQX4Q 0Ʌ5Pc]QP\EX4E$0Ʌ\$`4`BMPH(EQX4Q 0Ʌ5`c]QP\EX4E$0Ʌ\$p4pBMPH(EQX4Q 0Ʌ5pc]QP\EX4E$0Ʌ\$4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$Є4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4BMPH(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4BMP H(EQX4Q 0Ʌ5c]QP\EX4E$0Ʌ\$4CMP0H(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$4CMP@H(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$ 4 CMPPH(EQX4Q 0Ʌ5 d]QP\EX4E$0Ʌ\$040CMP`H(EQX4Q 0Ʌ50d]QP\EX4E$0Ʌ\$@4@CMPpH(EQX4Q 0Ʌ5@d]QP\EX4E$0Ʌ\$P4PCMPH(EQX4Q 0Ʌ5Pd]QP\EX4E$0Ʌ\$`4`CMPH(EQX4Q 0Ʌ5`d]QP\EX4E$0Ʌ\$p4pCMPH(EQX4Q 0Ʌ5pd]QP\EX4E$0Ʌ\$4CMPH(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$4CMPH(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$4CMPH(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$4CMPH(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$4CMPH(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$Ѕ4CMPH(EQX4Q 0Ʌ5d]QP\EX4E$0Ʌ\$4CMPH(EQX4Q 0Ʌ5d]QP\EX\4E$0\Ʌ$4C H($MP4MX$L 0Ʌ0d P4 X @"0PH# H@! 00# !"!0@05 q@00@0y r@ 0!00X0g qX `܋ aX `!X `1 ` qX0 XaX0 XpX ` X"   ~ Ʌ!PQ,,P@ @#HAP@(P@@\A 0P@0\Q P@@\!P@P\1P@`\!P@p1P\A@ PA\QP 0P\ACp@HaPA QP\AaPAQP\AaPAQP\AaPA QP\A0aPA@QP\APaPA`QP\ApaPAQP\AaPAQP\AaPAQP\AaPAQP\AaPQPP r1    ! ! ! $  )` ` 1   .version 2.2 .target sm_20 // compiled with /usr/local/cuda/open64/lib//be // nvopencc 3.2 built on 2010-09-08 //----------------------------------------------------------- // Compiling /tmp/tmpxft_0000173b_00000000-7_MDKDE_Isotropic.cpp3.i (/tmp/ccBI#.KzVq12) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "" .file 2 "/tmp/tmpxft_0000173b_00000000-6_MDKDE_Isotropic.cudafe2.gpu" .file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.5/include/stddef.h" .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" .file 5 "/usr/local/cuda/bin/../include/host_defines.h" .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" .file 7 "/usr/local/cuda/bin/../include/device_types.h" .file 8 "/usr/local/cuda/bin/../include/driver_types.h" .file 9 "/usr/local/cuda/bin/../include/surface_types.h" .file 10 "/usr/local/cuda/bin/../include/texture_types.h" .file 11 "/usr/local/cuda/bin/../include/vector_types.h" .file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h" .file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h" .file 14 "/usr/include/bits/types.h" .file 15 "/usr/include/time.h" .file 16 "MDKDE_Isotropic.cu" .file 17 "/usr/local/cuda/bin/../include/common_functions.h" .file 18 "/usr/local/cuda/bin/../include/math_functions.h" .file 19 "/usr/local/cuda/bin/../include/math_constants.h" .file 20 "/usr/local/cuda/bin/../include/device_functions.h" .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" .file 24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" .file 25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" .file 26 "/usr/local/cuda/bin/../include/surface_functions.h" .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx3.h" .const .s32 DimC; .const .s32 N1C; .const .s32 N2C; .const .align 4 .b8 BWC[4096]; .entry _Z22MDKDE_Isotropic_K32x16PfS_S_ ( .param .u64 __cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__AD, .param .u64 __cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__BD, .param .u64 __cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__DD) { .reg .u32 %r<45>; .reg .u64 %rd<45>; .reg .f32 %f<120>; .reg .pred %p<19>; .shared .align 4 .b8 __cuda___cuda_local_var_66639_32_non_const_ResDs4120[4224]; .shared .align 4 .b8 __cuda___cuda_local_var_66638_32_non_const_BDs8344[4224]; .shared .align 4 .b8 __cuda___cuda_local_var_66637_32_non_const_ADs12568[4224]; // __cuda_local_var_66642_8_non_const_temp = 32 .loc 16 26 0 $LDWbegin__Z22MDKDE_Isotropic_K32x16PfS_S_: mov.u64 %rd1, __cuda___cuda_local_var_66639_32_non_const_ResDs4120; .loc 16 34 0 mov.u32 %r1, %tid.x; cvt.u64.u32 %rd2, %r1; mov.u32 %r2, %tid.y; cvt.u64.u32 %rd3, %r2; mul.wide.u32 %rd4, %r2, 33; add.u64 %rd5, %rd2, %rd4; mul.lo.u64 %rd6, %rd5, 4; add.u64 %rd7, %rd6, %rd1; mov.f32 %f1, 0f00000000; // 0 mov.f32 %f2, 0f00000000; // 0 st.shared.f32 [%rd7+0], %f2; .loc 16 35 0 mov.f32 %f3, 0f00000000; // 0 mov.f32 %f4, 0f00000000; // 0 st.shared.f32 [%rd7+2112], %f4; ld.const.s32 %r3, [DimC]; mov.u32 %r4, 0; setp.le.s32 %p1, %r3, %r4; @%p1 bra $Lt_0_13058; mov.u64 %rd8, __cuda___cuda_local_var_66638_32_non_const_BDs8344; mov.u64 %rd9, __cuda___cuda_local_var_66637_32_non_const_ADs12568; add.s32 %r5, %r3, 31; shr.s32 %r6, %r5, 31; mov.s32 %r7, 31; and.b32 %r8, %r6, %r7; add.s32 %r9, %r8, %r5; shr.s32 %r10, %r9, 5; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.y; mul.lo.u32 %r13, %r11, %r12; ld.const.s32 %r14, [N2C]; setp.gt.s32 %p2, %r14, %r2; add.s32 %r15, %r2, 16; mul.lo.u64 %rd10, %rd2, 33; mov.u32 %r16, %ctaid.x; add.u32 %r17, %r16, %r13; setp.lt.s32 %p3, %r15, %r14; mul.lo.u32 %r18, %r17, 32; add.u32 %r19, %r18, %r2; add.u64 %rd11, %rd3, %rd10; mul.lo.u64 %rd12, %rd11, 4; add.u64 %rd13, %rd1, %rd12; add.u64 %rd14, %rd6, %rd9; add.u64 %rd15, %rd6, %rd8; ld.const.s32 %r20, [N1C]; setp.gt.s32 %p4, %r20, %r19; add.s32 %r21, %r19, 16; setp.lt.s32 %p5, %r21, %r20; mov.s32 %r22, 0; mov.u64 %rd16, BWC; mov.s32 %r23, %r10; $Lt_0_13570: // Loop body line 35, nesting depth: 1, estimated iterations: unknown @!%p4 bra $Lt_0_14082; // Part of loop body line 35, head labeled $Lt_0_13570 add.u32 %r24, %r22, %r1; setp.le.u32 %p6, %r3, %r24; @%p6 bra $Lt_0_14594; // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 44 0 ld.param.u64 %rd17, [__cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__AD]; mul.lo.s32 %r25, %r3, %r19; add.u32 %r26, %r24, %r25; cvt.u64.u32 %rd18, %r26; mul.wide.u32 %rd19, %r26, 4; add.u64 %rd20, %rd17, %rd19; ld.global.f32 %f5, [%rd20+0]; bra.uni $Lt_0_13826; $Lt_0_14594: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 46 0 mov.f32 %f5, 0f00000000; // 0 bra.uni $Lt_0_13826; $Lt_0_14082: // Part of loop body line 35, head labeled $Lt_0_13570 mov.f32 %f5, %f6; $Lt_0_13826: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 48 0 st.shared.f32 [%rd14+0], %f5; @!%p5 bra $Lt_0_15362; // Part of loop body line 35, head labeled $Lt_0_13570 add.u32 %r24, %r22, %r1; setp.le.u32 %p7, %r3, %r24; @%p7 bra $Lt_0_15618; // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 44 0 ld.param.u64 %rd21, [__cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__AD]; mul.lo.s32 %r27, %r21, %r3; add.u32 %r28, %r24, %r27; cvt.u64.u32 %rd22, %r28; mul.wide.u32 %rd23, %r28, 4; add.u64 %rd24, %rd21, %rd23; ld.global.f32 %f5, [%rd24+0]; bra.uni $Lt_0_15362; $Lt_0_15618: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 46 0 mov.f32 %f5, 0f00000000; // 0 $Lt_0_15362: $Lt_0_14850: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 48 0 st.shared.f32 [%rd14+2112], %f5; @!%p2 bra $Lt_0_16386; // Part of loop body line 35, head labeled $Lt_0_13570 add.u32 %r24, %r22, %r1; setp.le.u32 %p8, %r3, %r24; @%p8 bra $Lt_0_16642; // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 57 0 ld.param.u64 %rd25, [__cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__BD]; mul.lo.s32 %r29, %r3, %r2; add.u32 %r30, %r24, %r29; cvt.u64.u32 %rd26, %r30; mul.wide.u32 %rd27, %r30, 4; add.u64 %rd28, %rd25, %rd27; ld.global.f32 %f5, [%rd28+0]; bra.uni $Lt_0_16386; $Lt_0_16642: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 59 0 mov.f32 %f5, 0f00000000; // 0 $Lt_0_16386: $Lt_0_15874: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 61 0 st.shared.f32 [%rd15+0], %f5; @!%p3 bra $Lt_0_17410; // Part of loop body line 35, head labeled $Lt_0_13570 add.u32 %r24, %r22, %r1; setp.le.u32 %p9, %r3, %r24; @%p9 bra $Lt_0_17666; // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 57 0 ld.param.u64 %rd29, [__cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__BD]; mul.lo.s32 %r31, %r15, %r3; add.u32 %r32, %r24, %r31; cvt.u64.u32 %rd30, %r32; mul.wide.u32 %rd31, %r32, 4; add.u64 %rd32, %rd29, %rd31; ld.global.f32 %f5, [%rd32+0]; bra.uni $Lt_0_17410; $Lt_0_17666: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 59 0 mov.f32 %f5, 0f00000000; // 0 $Lt_0_17410: $Lt_0_16898: // Part of loop body line 35, head labeled $Lt_0_13570 .loc 16 61 0 st.shared.f32 [%rd15+2112], %f5; .loc 16 63 0 bar.sync 0; mov.s64 %rd33, %rd16; mul.lo.u64 %rd34, %rd10, 4; add.u64 %rd35, %rd9, %rd34; mul.lo.u64 %rd36, %rd4, 4; add.u64 %rd37, %rd8, %rd36; ld.shared.f32 %f7, [%rd13+0]; ld.shared.f32 %f8, [%rd13+64]; mov.s32 %r33, 0; $Lt_0_18434: // Loop body line 63, nesting depth: 2, iterations: 32 .loc 16 68 0 ld.const.f32 %f9, [%rd33+0]; ld.shared.f32 %f10, [%rd35+0]; ld.shared.f32 %f11, [%rd37+0]; sub.f32 %f12, %f10, %f11; mul.f32 %f13, %f12, %f12; fma.rn.f32 %f7, %f9, %f13, %f7; st.shared.f32 [%rd13+0], %f7; .loc 16 70 0 ld.shared.f32 %f14, [%rd37+2112]; sub.f32 %f15, %f10, %f14; mov.f32 %f5, %f15; .loc 16 71 0 mul.f32 %f16, %f15, %f15; fma.rn.f32 %f8, %f9, %f16, %f8; st.shared.f32 [%rd13+64], %f8; add.s32 %r33, %r33, 1; add.u64 %rd37, %rd37, 4; add.u64 %rd35, %rd35, 4; add.u64 %rd33, %rd33, 4; mov.u32 %r34, 32; setp.ne.s32 %p10, %r33, %r34; @%p10 bra $Lt_0_18434; // Part of loop body line 35, head labeled $Lt_0_13570 mov.f32 %f6, %f5; .loc 16 73 0 bar.sync 0; add.s32 %r22, %r22, 32; setp.gt.s32 %p11, %r3, %r22; @%p11 bra $Lt_0_13570; ld.shared.f32 %f1, [%rd7+0]; ld.shared.f32 %f3, [%rd7+2112]; $Lt_0_13058: .loc 16 76 0 mov.f32 %f17, %f1; .loc 18 1503 0 mov.f32 %f18, 0f3fb8aa3b; // 1.4427 mul.f32 %f19, %f17, %f18; cvt.rzi.f32.f32 %f20, %f19; mov.f32 %f21, 0f7f800000; // ((1.0F)/(0.0F)) mov.f32 %f22, 0f00000000; // 0 ex2.approx.f32 %f23, %f20; mov.f32 %f24, 0fb5bfbe8e; // -1.42861e-06 mov.f32 %f25, 0fbf317200; // -0.693146 fma.rn.f32 %f26, %f20, %f25, %f17; fma.rn.f32 %f27, %f20, %f24, %f26; mov.f32 %f28, 0f3fb8aa3b; // 1.4427 mul.f32 %f29, %f27, %f28; ex2.approx.f32 %f30, %f29; mul.f32 %f31, %f23, %f30; mov.f32 %f32, 0fc2d20000; // -105 setp.lt.f32 %p12, %f17, %f32; selp.f32 %f33, %f22, %f31, %p12; mov.f32 %f34, 0f42d20000; // 105 setp.gt.f32 %p13, %f17, %f34; selp.f32 %f35, %f21, %f33, %p13; .loc 16 76 0 st.shared.f32 [%rd7+0], %f35; .loc 16 77 0 mov.f32 %f36, %f3; .loc 18 1503 0 mov.f32 %f37, 0f3fb8aa3b; // 1.4427 mul.f32 %f38, %f36, %f37; cvt.rzi.f32.f32 %f39, %f38; mov.f32 %f40, 0f7f800000; // ((1.0F)/(0.0F)) mov.f32 %f41, 0f00000000; // 0 ex2.approx.f32 %f42, %f39; mov.f32 %f43, 0fb5bfbe8e; // -1.42861e-06 mov.f32 %f44, 0fbf317200; // -0.693146 fma.rn.f32 %f45, %f39, %f44, %f36; fma.rn.f32 %f46, %f39, %f43, %f45; mov.f32 %f47, 0f3fb8aa3b; // 1.4427 mul.f32 %f48, %f46, %f47; ex2.approx.f32 %f49, %f48; mul.f32 %f50, %f42, %f49; mov.f32 %f51, 0fc2d20000; // -105 setp.lt.f32 %p14, %f36, %f51; selp.f32 %f52, %f41, %f50, %p14; mov.f32 %f53, 0f42d20000; // 105 setp.gt.f32 %p15, %f36, %f53; selp.f32 %f54, %f40, %f52, %p15; .loc 16 77 0 st.shared.f32 [%rd7+2112], %f54; .loc 16 78 0 bar.sync 0; mov.u32 %r35, 0; setp.ne.u32 %p16, %r2, %r35; @%p16 bra $Lt_0_19714; mov.u32 %r36, %nctaid.x; mov.u32 %r37, %ctaid.y; mul.lo.u32 %r38, %r36, %r37; mov.u32 %r39, %ctaid.x; add.u32 %r40, %r39, %r38; mul.lo.u32 %r41, %r40, 32; add.u32 %r42, %r41, %r1; ld.const.s32 %r43, [N1C]; setp.le.s32 %p17, %r43, %r42; @%p17 bra $Lt_0_19714; .loc 16 86 0 mul.lo.u64 %rd38, %rd2, 132; add.u64 %rd39, %rd1, %rd38; ld.shared.f32 %f55, [%rd39+32]; ld.shared.f32 %f56, [%rd39+28]; ld.shared.f32 %f57, [%rd39+24]; ld.shared.f32 %f58, [%rd39+20]; ld.shared.f32 %f59, [%rd39+16]; ld.shared.f32 %f60, [%rd39+12]; ld.shared.f32 %f61, [%rd39+8]; ld.shared.f32 %f62, [%rd39+0]; ld.shared.f32 %f63, [%rd39+4]; add.f32 %f64, %f62, %f63; add.f32 %f65, %f61, %f64; add.f32 %f66, %f60, %f65; add.f32 %f67, %f59, %f66; add.f32 %f68, %f58, %f67; add.f32 %f69, %f57, %f68; add.f32 %f70, %f56, %f69; add.f32 %f71, %f55, %f70; ld.shared.f32 %f72, [%rd39+36]; add.f32 %f73, %f72, %f71; ld.shared.f32 %f74, [%rd39+40]; add.f32 %f75, %f74, %f73; ld.shared.f32 %f76, [%rd39+44]; add.f32 %f77, %f76, %f75; ld.shared.f32 %f78, [%rd39+48]; add.f32 %f79, %f78, %f77; ld.shared.f32 %f80, [%rd39+52]; add.f32 %f81, %f80, %f79; ld.shared.f32 %f82, [%rd39+56]; add.f32 %f83, %f82, %f81; ld.shared.f32 %f84, [%rd39+60]; add.f32 %f85, %f84, %f83; ld.shared.f32 %f86, [%rd39+64]; add.f32 %f87, %f86, %f85; ld.shared.f32 %f88, [%rd39+68]; add.f32 %f89, %f88, %f87; ld.shared.f32 %f90, [%rd39+72]; add.f32 %f91, %f90, %f89; ld.shared.f32 %f92, [%rd39+76]; add.f32 %f93, %f92, %f91; ld.shared.f32 %f94, [%rd39+80]; add.f32 %f95, %f94, %f93; ld.shared.f32 %f96, [%rd39+84]; add.f32 %f97, %f96, %f95; ld.shared.f32 %f98, [%rd39+88]; add.f32 %f99, %f98, %f97; ld.shared.f32 %f100, [%rd39+92]; add.f32 %f101, %f100, %f99; ld.shared.f32 %f102, [%rd39+96]; add.f32 %f103, %f102, %f101; ld.shared.f32 %f104, [%rd39+100]; add.f32 %f105, %f104, %f103; ld.shared.f32 %f106, [%rd39+104]; add.f32 %f107, %f106, %f105; ld.shared.f32 %f108, [%rd39+108]; add.f32 %f109, %f108, %f107; ld.shared.f32 %f110, [%rd39+112]; add.f32 %f111, %f110, %f109; ld.shared.f32 %f112, [%rd39+116]; add.f32 %f113, %f112, %f111; ld.shared.f32 %f114, [%rd39+120]; add.f32 %f115, %f114, %f113; ld.shared.f32 %f116, [%rd39+124]; add.f32 %f5, %f116, %f115; .loc 16 87 0 ld.param.u64 %rd40, [__cudaparm__Z22MDKDE_Isotropic_K32x16PfS_S__DD]; cvt.s64.s32 %rd41, %r42; mul.wide.s32 %rd42, %r42, 4; add.u64 %rd43, %rd40, %rd42; ld.global.f32 %f117, [%rd43+0]; add.f32 %f118, %f117, %f5; st.global.f32 [%rd43+0], %f118; $Lt_0_19714: $Lt_0_19202: .loc 16 91 0 exit; $LDWend__Z22MDKDE_Isotropic_K32x16PfS_S_: } // _Z22MDKDE_Isotropic_K32x16PfS_S_ _Z22MDKDE_Isotropic_K32x16PfS_S_%s(%i) : cudaSafeCall() Runtime API error : %s. Error querying the device for free memory! Total memory on the device: %d MB, Memory available: %d MB, Required memory: %d MB Error, the data does not fit in GPU memory: requires %d bytes(%d essential, %d temp), %d bytes available Error occured allocating memory MDKDE_Isotropic finished, total time: %f seconds Use [P] = MDKDE_Isotropic(Y,X,Sigma); (Float) Where X(dim x num_samples) holds the sampled points, Y(dim x num_eval) holds the points on which the densities are to be evaluated, Sigma holds the Isotropic bandwidth for each dimension. For homogeneous KDE pass Sigma with equal bandwidths for each dimension. Written by Omid Aghazadeh: This code is free to use for ACADEMIC RESEARCH and NOT FOR COMMERCIAL USE. The dimensions of X,Y do not match.Sigma should be a (dimx1) vector. Dimension cannot be more than MAX_DIMENSION.DimCN1CN2CBWCMDKDE_Isotropic.cuclear MDKDE_IsotropicProblem with constant memory!d433f7dcd97d064d-v compute_20sm_20?@@?Ne 7f?/镇?k;'(@v`G @ӟ @8_~V?`5BL?rsVw?]Yuq?Y0%*"?xbI@ T*@>>? ?~?Z@U =@ԆBzG@PgA@CČ)@<G@gFǰ?Zo?!R$)@OzD@o[J@@@v&@K8K@@i?|t5{??h7y|@;Q~@β$7@GE^@bܲ@M=s@UsT@: g7N@=Bi@7O@{Q)@@׽q@?0.++ b?">R6?$8?h0߼?;[Fx?y!?+3>p3>8-e3>2NB[=c[=?-DT! @UUUUUUտUUUUUU??@@@@?;ttt,tDt\tt uu,u|vv wwLw4lwLL{dl{|~~܀̃L zPRx  $s<sTs ls ssxs ps hsEK0t tuD0,uDt*D\uGtuDPxGxEH0 |G(|AH0E|G }H0$}G4<}GBCB B(CA0A8BG4tx"BBB CB(A0CA8DX$p [NUzRx @@  x:o J     oH oo oo o  &6FVfv&6FVfv&6FVfv `e : *ZQO e GCC: (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5.symtab.strtab.shstrtab.note.gnu.build-id.gnu.hash.dynsym.dynstr.gnu.version.gnu.version_d.gnu.version_r.rela.dyn.rela.plt.init.text.fini.rodata.eh_frame_hdr.eh_frame.ctors.dtors.jcr.dynamic.got.got.plt.data.nvFatBinSegment.bss.comment$.o(8 p@JHo tUoH H 8do s }  0$x:x:::tg h h    0  h    0+˲X \ 8 H        x::h          h * 8 E [ j x x   @:: `e' @E    Ȳ ̲ в   5 G [ p    %" # x: E $ "+  1 :8 uq w  " 0*   `    !    ` 0 @        " + 1"BIR"d~  ",0H"Zo" )"E"Zf"{" 8 &1;"O\g"z"""call_gmon_startcrtstuff.c__CTOR_LIST____DTOR_LIST____JCR_LIST____do_global_dtors_auxcompleted.7424dtor_idx.7426frame_dummy__CTOR_END____FRAME_END____JCR_END____do_global_ctors_auxtmpxft_0000173b_00000000-1_MDKDE_Isotropic.cudafe1.cpp__deviceText_$sm_20$__deviceText_$compute_20$_ZL87__sti____cudaRegisterAll_50_tmpxft_0000173b_00000000_4_MDKDE_Isotropic_cpp1_ii_c79a3877v_ZL15__fatDeviceText_ZL26__cudaUnregisterBinaryUtilv_ZL20__cudaFatCubinHandle_ZL4DimC_ZL3N1C_ZL3N2C_ZL3BWC_ZZ46__device_stub__Z22MDKDE_Isotropic_K32x16PfS_S_PfS_S_E3__f_ZL12__ptxEntries_ZL14__cubinEntries_ZL15__debugEntries0_ZL13__elfEntries1_ZL13__elfEntries0mexversion.cDW.ref.__gxx_personality_v0_Z15MDKDE_IsotropicPfS_iiiS_S__Z20MDKDE_Isotropic_ProbPfS_iiiS__finierfcinv_GLOBAL_OFFSET_TABLE_rcbrtfllmaxatexit_Z46__device_stub__Z22MDKDE_Isotropic_K32x16PfS_S_PfS_S_sinpierfcinvfrcbrtrsqrt_Z22MDKDE_Isotropic_K32x16PfS_S___dso_handlersqrtf__DTOR_END__ullminsinpif__bss_startminmaxerfinvfullmax_enderfinv_edatallminumaxumin_DYNAMIC_initexp@@GLIBC_2.2.5mxGetNcudaFreeceil@@GLIBC_2.2.5mxCreateNumericMatrix_700__fprintf_chk@@GLIBC_2.3.4__cudaRegisterVar__gmon_start___Jv_RegisterClasses_Znam@@GLIBCXX_3.4log1p@@GLIBC_2.2.5exit@@GLIBC_2.2.5cublasShutdown__cudaRegisterFunctionMEXcudaMemcpyToSymbolAsynclog2@@GLIBC_2.2.5cudaGetSymbolAddresscudaMemcpyToSymbolcos@@GLIBC_2.2.5__cxa_atexit@@GLIBC_2.2.5mexPrintfcudaSetupArgumentcudaThreadSynchronizecudaConfigureCall__cudaUnregisterFatBinarycudaGetErrorStringcudaLaunch__cxa_finalize@@GLIBC_2.2.5__isnan@@GLIBC_2.2.5cublasSscal__isinf@@GLIBC_2.2.5cudaMemGetInfo__cudaRegisterFatBinarymexEvalStringexp2@@GLIBC_2.2.5_ZdaPv@@GLIBCXX_3.4ceilf@@GLIBC_2.2.5mxGetMdifftime@@GLIBC_2.2.5mexFunctionstderr@@GLIBC_2.2.5cublasInitmxGetDataremquo@@GLIBC_2.2.5mexErrMsgTxtcudaMemcpyfloor@@GLIBC_2.2.5cudaMalloc__gxx_personality_v0@@CXXABI_1.3mxIsSinglesqrt@@GLIBC_2.2.5sin@@GLIBC_2.2.5log@@GLIBC_2.2.5cudaMemcpyAsynctime@@GLIBC_2.2.5