CRAN Package Check Results for Package clrng

Last updated on 2025-05-15 18:50:43 CEST.

| Flavor | Version | Tinstall | Tcheck | Ttotal | Status | Flags |
|---|---|---|---|---|---|---|
| r-devel-linux-x86_64-debian-clang | 0.0.5 | 128.28 | 44.60 | 172.88 | OK | |
| r-devel-linux-x86_64-debian-gcc | 0.0.5 | 104.62 | 36.96 | 141.58 | OK | |
| r-patched-linux-x86_64 | 0.0.5 | 138.28 | 44.59 | 182.87 | OK | |
| r-release-linux-x86_64 | 0.0.5 | 142.70 | 44.21 | 186.91 | OK | |
| r-release-macos-arm64 | 0.0.5 | | | 107.00 | OK | |
| r-release-macos-x86_64 | 0.0.5 | | | 108.00 | ERROR | |
| r-oldrel-macos-arm64 | 0.0.5 | | | 94.00 | NOTE | |
| r-oldrel-macos-x86_64 | 0.0.5 | | | 126.00 | ERROR | |

Check Details

Version: 0.0.5
Check: examples
Result: ERROR Running examples in ‘clrng-Ex.R’ failed The error most likely occurred in: > ### Name: rnormGpu > ### Title: rnormGpu > ### Aliases: rnormGpu > > ### ** Examples > > library(clrng) > if (detectGPUs() >= 1) { + setContext(grep("gpu", listContexts()$device_type)[1]) + currentPlatform() + streams <- createStreamsGpu() + as.vector(rnormGpu(7, streams=streams)) + + getOption('clrng.Nglobal') + # use float precision and global size + as.matrix(rnormGpu(c(2,3), streams=streams))} else { + message("No GPU context available") + } Build Status = -2 ( Err = -11 ) Log: <program source>:22:6: warning: no previous prototype for function 'streamsToPrivate' void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){ ^ <program source>:30:6: warning: no previous prototype for function 'streamsFromPrivate' void streamsFromPrivate(__global int* streams, uint* g1, uint* g2, const int start){ ^ <program source>:38:6: warning: no previous prototype for function 'clrngMrg31k3pNextState' uint clrngMrg31k3pNextState(uint *g1, uint *g2) { ^ <program source>:97:14: error: call to '__cl_sqrt' is ambiguous temp = sqrt( -2.0*log(part[0]) ) * cos(part[1] + addForSine[get_local_id(1)] );// is cos for local0, sine for local1 ^~~~~~~~~~~~~~~~~~~~~~~~~ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4788:23: note: expanded from macro 'sqrt' #define sqrt(__x) __cl_sqrt(__x) ^~~~~~~~~ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function __CLFN_FD_1FD(__cl_sqrt); ^ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:367:48: note: expanded from macro '__CLFN_FD_1FD' #define __CLFN_FD_1FD(name) float __OVERLOAD__ name(float x); \ ^ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function 
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:368:21: note: expanded from macro '__CLFN_FD_1FD' float2 __OVERLOAD__ name(float2 x); \ ^ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:369:21: note: expanded from macro '__CLFN_FD_1FD' float3 __OVERLOAD__ name(float3 x); \ ^ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:370:21: note: expanded from macro '__CLFN_FD_1FD' float4 __OVERLOAD__ name(float4 x); \ ^ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:371:21: note: expanded from macro '__CLFN_FD_1FD' float8 __OVERLOAD__ name(float8 x); \ ^ /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function /System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:372:22: note: expanded from macro '__CLFN_FD_1FD' float16 __OVERLOAD__ name(float16 x); ^ Sources: #define TWOPI 6.28318530717 #define PI_2 M_PI_2_F #define mrg31k3p_NORM_cl 4.6566126e-10 //TWOPI * mrg31k3p_NORM_cl #define TWOPI_mrg31k3p_NORM_cl 2.9258361e-09 #define Nrow 1 #define Ncol 7 #define NpadStreams 128 #define NpadCol 128 #define mrg31k3p_M1 2147483647 #define mrg31k3p_M2 2147462579 #define mrg31k3p_MASK12 511 #define mrg31k3p_MASK13 16777215 #define mrg31k3p_MASK2 65535 #define mrg31k3p_MULT2 21069 void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int 
start){ int Drow, Dcol, DrowStart; for(Drow = 0, DrowStart = start, Dcol = DrowStart + 3; Drow < 3; Drow++, DrowStart++, Dcol++){ g1[Drow] = streams[DrowStart]; g2[Drow] = streams[Dcol]; } } void streamsFromPrivate(__global int* streams, uint* g1, uint* g2, const int start){ int Drow, Dcol, DrowStart; for(Drow = 0,DrowStart = start, Dcol = DrowStart + 3; Drow < 3; Drow++, DrowStart++, Dcol++){ streams[DrowStart] = g1[Drow]; streams[Dcol] = g2[Drow]; } } uint clrngMrg31k3pNextState(uint *g1, uint *g2) { uint y1, y2; y1 = ((g1[1] & mrg31k3p_MASK12) << 22) + (g1[1] >> 9) + ((g1[2] & mrg31k3p_MASK13) << 7) + (g1[2] >> 24); if (y1 >= mrg31k3p_M1) y1 -= mrg31k3p_M1; y1 += g1[2]; if (y1 >= mrg31k3p_M1) y1 -= mrg31k3p_M1; g1[2] = g1[1]; g1[1] = g1[0]; g1[0] = y1; y1 = ((g2[0] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[0] >> 16)); if (y1 >= mrg31k3p_M2) y1 -= mrg31k3p_M2; y2 = ((g2[2] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[2] >> 16)); if (y2 >= mrg31k3p_M2) y2 -= mrg31k3p_M2; y2 += g2[2]; if (y2 >= mrg31k3p_M2) y2 -= mrg31k3p_M2; y2 += y1; if (y2 >= mrg31k3p_M2) y2 -= mrg31k3p_M2; g2[2] = g2[1]; g2[1] = g2[0]; g2[0] = y2; if (g1[0] <= g2[0]){ return (g1[0] - g2[0] + mrg31k3p_M1); } else { return(g1[0] - g2[0]); } } __kernel void mrg31k3pMatrix( __global int* streams, __global float* out){ const int index = get_global_id(0)*get_global_size(1) + get_global_id(1); int Drow, Dcol, DrowStart, Dentry, DrowBlock, DcolBlock, DrowInBounds; const int DrowStartInc = get_global_size(0) * NpadCol; uint g1[3], g2[3]; const int startvalue=index * NpadStreams; float temp; const float fact[2] = { mrg31k3p_NORM_cl, TWOPI * mrg31k3p_NORM_cl }; const float addForSine[2] = { 0.0, - PI_2 }; local float part[2]; streamsToPrivate(streams,g1,g2,startvalue); for(DrowBlock = 0, Drow=get_global_id(0), DrowStart = Drow * NpadCol; DrowBlock < Nrow; Drow += get_global_size(0), DrowBlock +=get_global_size(0), DrowStart += DrowStartInc) { DrowInBounds = Drow < Nrow; for(DcolBlock = 0, 
Dcol=get_global_id(1), Dentry = DrowStart + Dcol; DcolBlock < Ncol; DcolBlock += get_global_size(1), Dentry += get_global_size(1) ) { part[get_local_id(1)] = fact[get_local_id(1)] * clrngMrg31k3pNextState(g1, g2); barrier(CLK_LOCAL_MEM_FENCE); temp = sqrt( -2.0*log(part[0]) ) * cos(part[1] + addForSine[get_local_id(1)] );// is cos for local0, sine for local1 if(DrowInBounds) out[Dentry] = temp; barrier(CLK_LOCAL_MEM_FENCE); }//Dcol }//Drow streamsFromPrivate(streams,g1,g2,startvalue); }//kernel Error: ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE. If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information: * Operating System * Which OpenCL implementation (AMD, NVIDIA, etc.) * ViennaCL version Many thanks in advance! Execution halted Flavor: r-release-macos-x86_64

Version: 0.0.5
Check: re-building of vignette outputs
Result: ERROR Error(s) in re-building vignettes: --- re-building ‘createStreams.Rmd’ using knitr --- finished re-building ‘createStreams.Rmd’ --- re-building ‘fisher.sim.Rmd’ using knitr --- finished re-building ‘fisher.sim.Rmd’ --- re-building ‘random_numbers.Rmd’ using knitr Quitting from random_numbers.Rmd:62-96 [rnormGpu] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <error/rlang_error> Error: ! ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE. If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information: * Operating System * Which OpenCL implementation (AMD, NVIDIA, etc.) * ViennaCL version Many thanks in advance! --- Backtrace: ▆ 1. ├─base::as.vector(clrng::rnormGpu(10, myStreamsGpu, verbose = 2)) 2. └─clrng::rnormGpu(10, myStreamsGpu, verbose = 2) 3. └─clrng:::gpuRnBackend(xVcl, streams, Nglobal, "normal", verbose) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Error: processing vignette 'random_numbers.Rmd' failed with diagnostics: ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE. If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information: * Operating System * Which OpenCL implementation (AMD, NVIDIA, etc.) * ViennaCL version Many thanks in advance! --- failed re-building ‘random_numbers.Rmd’ SUMMARY: processing the following file failed: ‘random_numbers.Rmd’ Error: Vignette re-building failed. Execution halted Flavor: r-release-macos-x86_64

Version: 0.0.5
Check: installed package size
Result: NOTE installed size is 11.5Mb sub-directories of 1Mb or more: libs 11.3Mb Flavors: r-oldrel-macos-arm64, r-oldrel-macos-x86_64

Version: 0.0.5
Check: examples
Result: ERROR Running examples in ‘clrng-Ex.R’ failed The error most likely occurred in: > ### Name: createStreamsCpu > ### Title: createStreamsCpu > ### Aliases: createStreamsCpu > > ### ** Examples > > library(clrng) > if (detectGPUs() >= 1) { + t(createStreamsCpu(n=5)) + ## GPU streams + myStreamsGpu = vclMatrix(createStreamsCpu(n=4)) }else { + message("No GPU context available") + } Build Status = -2 ( Err = -11 ) Log: CVMS_ERROR_INVALID_FILE_DES: Invalid file descriptor. Sources: __kernel void am_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2) { int alpha = fac2; if (options2 & (1 << 0)) alpha = -alpha; if (options2 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha ; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha ; } } __kernel void am_gpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int 
A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2) { int alpha = fac2[0]; if (options2 & (1 << 0)) alpha = -alpha; if (options2 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha ; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha ; } } __kernel void ambm_cpu_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, int fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2; 
if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3; if (options3 & (1 << 0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < 
A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_cpu_gpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, __global int * fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2; if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3[0]; if (options3 & (1 << 0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + 
A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_gpu_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, int fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2[0]; if (options2 & (1 << 0)) alpha = 
-alpha; int beta = fac3; if (options3 & (1 << 0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) 
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_gpu_gpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, __global int * fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2[0]; if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3[0]; if (options3 & (1 << 0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 
+ B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_m_cpu_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, int fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2; if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3; if (options3 & (1 << 
0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * 
A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_m_cpu_gpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, __global int * fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2; if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3[0]; if (options3 & (1 << 0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * 
B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_m_gpu_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, int fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2[0]; if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3; if (options3 & (1 << 0)) beta = -beta; if 
(options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + 
A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void ambm_m_gpu_gpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * fac2, unsigned int options2, __global const int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, __global int * fac3, unsigned int options3, __global const int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2 ) { int alpha = fac2[0]; if (options2 & (1 << 0)) alpha = -alpha; int beta = fac3[0]; if (options3 & (1 << 0)) beta = -beta; if (options2 & (1 << 1)) { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] 
/ alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } else { if (options3 & (1 << 1)) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta; } else { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta; } } } __kernel void assign_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int alpha) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; } __kernel void diagonal_assign_cpu( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int 
A_internal_size1, unsigned int A_internal_size2, int alpha) { for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0)) A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; } __kernel void element_op( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, __global int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); if (op_type == 2) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) { int factor = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)]; int power = C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; int val = (power >= 0) ? 
1 : 0; for (int p = 0; p < power; ++p) val *= factor; A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = val; } } else if (op_type == 1) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; } else if (op_type == 0) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; }} __kernel void element_op_va( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, __global int * B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, int alpha, unsigned int op_type) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); if (op_type == 2) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) { int factor = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)]; int power = alpha; int val = (power >= 0) ? 
1 : 0; for (int p = 0; p < power; ++p) val *= factor; A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = val; } } else if (op_type == 1) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha; } else if (op_type == 0) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha; }} __kernel void element_op_av( __global int * A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, int alpha, __global int * C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); if (op_type == 2) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) { int factor = alpha; int power = C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; int val = (power >= 0) ? 
1 : 0; for (int p = 0; p < power; ++p) val *= factor; A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = val; } } else if (op_type == 1) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha / C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; } else if (op_type == 0) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha * C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; }} __kernel void trans_vec_mul( __global const int * A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, __global const int * v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, __global int * result, unsigned int result_start, unsigned int result_inc, unsigned int result_size, __local int * work) { for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) { int dot_prod = 0; for (unsigned int col = 0; col < A_row_size; ++col) dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; result[row * result_inc + result_start] = dot_prod; } } __kernel void vec_mul( __global const int * A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, __global const int * v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, __global int * result, 
unsigned int result_start, unsigned int result_inc, unsigned int result_size, __local int * work) { unsigned int row_gid = get_global_id(0) / get_local_size(0); unsigned int col_gid = get_global_id(0) % get_local_size(0); unsigned int lid = get_local_id(0); for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) { int dot_prod = 0; for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; work[lid] = dot_prod; for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ barrier(CLK_LOCAL_MEM_FENCE); if(lid < stride) work[lid] += work[lid+stride]; } if(lid == 0) result[row * result_inc + result_start] = work[0]; } } Error in (function (cond) : error in evaluating the argument 'x' in selecting a method for function 't': ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE. If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information: * Operating System * Which OpenCL implementation (AMD, NVIDIA, etc.) * ViennaCL version Many thanks in advance! Calls: t ... new -> initialize -> initialize -> cpp_scalar_vclMatrix Execution halted Flavor: r-oldrel-macos-x86_64