Last updated on 2025-05-15 18:50:43 CEST.
Flavor | Version | Tinstall | Tcheck | Ttotal | Status | Flags |
---|---|---|---|---|---|---|
r-devel-linux-x86_64-debian-clang | 0.0.5 | 128.28 | 44.60 | 172.88 | OK | |
r-devel-linux-x86_64-debian-gcc | 0.0.5 | 104.62 | 36.96 | 141.58 | OK | |
r-patched-linux-x86_64 | 0.0.5 | 138.28 | 44.59 | 182.87 | OK | |
r-release-linux-x86_64 | 0.0.5 | 142.70 | 44.21 | 186.91 | OK | |
r-release-macos-arm64 | 0.0.5 | 107.00 | OK | |||
r-release-macos-x86_64 | 0.0.5 | 108.00 | ERROR | |||
r-oldrel-macos-arm64 | 0.0.5 | 94.00 | NOTE | |||
r-oldrel-macos-x86_64 | 0.0.5 | 126.00 | ERROR |
Version: 0.0.5
Check: examples
Result: ERROR
Running examples in ‘clrng-Ex.R’ failed
The error most likely occurred in:
> ### Name: rnormGpu
> ### Title: rnormGpu
> ### Aliases: rnormGpu
>
> ### ** Examples
>
> library(clrng)
> if (detectGPUs() >= 1) {
+ setContext(grep("gpu", listContexts()$device_type)[1])
+ currentPlatform()
+ streams <- createStreamsGpu()
+ as.vector(rnormGpu(7, streams=streams))
+
+ getOption('clrng.Nglobal')
+ # use float precision and global size
+ as.matrix(rnormGpu(c(2,3), streams=streams))} else {
+ message("No GPU context available")
+ }
Build Status = -2 ( Err = -11 )
Log: <program source>:22:6: warning: no previous prototype for function 'streamsToPrivate'
void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){
^
<program source>:30:6: warning: no previous prototype for function 'streamsFromPrivate'
void streamsFromPrivate(__global int* streams, uint* g1, uint* g2, const int start){
^
<program source>:38:6: warning: no previous prototype for function 'clrngMrg31k3pNextState'
uint clrngMrg31k3pNextState(uint *g1, uint *g2) {
^
<program source>:97:14: error: call to '__cl_sqrt' is ambiguous
temp = sqrt( -2.0*log(part[0]) ) * cos(part[1] + addForSine[get_local_id(1)] );// is cos for local0, sine for local1
^~~~~~~~~~~~~~~~~~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4788:23: note: expanded from macro 'sqrt'
#define sqrt(__x) __cl_sqrt(__x)
^~~~~~~~~
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function
__CLFN_FD_1FD(__cl_sqrt);
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:367:48: note: expanded from macro '__CLFN_FD_1FD'
#define __CLFN_FD_1FD(name) float __OVERLOAD__ name(float x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:368:21: note: expanded from macro '__CLFN_FD_1FD'
float2 __OVERLOAD__ name(float2 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:369:21: note: expanded from macro '__CLFN_FD_1FD'
float3 __OVERLOAD__ name(float3 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:370:21: note: expanded from macro '__CLFN_FD_1FD'
float4 __OVERLOAD__ name(float4 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:371:21: note: expanded from macro '__CLFN_FD_1FD'
float8 __OVERLOAD__ name(float8 x); \
^
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:4785:15: note: candidate function
/System/Library/PrivateFrameworks/GPUCompiler.framework/Versions/A/lib/clang/3.5/include/cl_kernel.h:372:22: note: expanded from macro '__CLFN_FD_1FD'
float16 __OVERLOAD__ name(float16 x);
^
Sources:
#define TWOPI 6.28318530717
#define PI_2 M_PI_2_F
#define mrg31k3p_NORM_cl 4.6566126e-10
//TWOPI * mrg31k3p_NORM_cl
#define TWOPI_mrg31k3p_NORM_cl 2.9258361e-09
#define Nrow 1
#define Ncol 7
#define NpadStreams 128
#define NpadCol 128
#define mrg31k3p_M1 2147483647
#define mrg31k3p_M2 2147462579
#define mrg31k3p_MASK12 511
#define mrg31k3p_MASK13 16777215
#define mrg31k3p_MASK2 65535
#define mrg31k3p_MULT2 21069
void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){
int Drow, Dcol, DrowStart; for(Drow = 0, DrowStart = start, Dcol = DrowStart + 3;
Drow < 3; Drow++, DrowStart++, Dcol++){
g1[Drow] = streams[DrowStart];
g2[Drow] = streams[Dcol];
}
}
void streamsFromPrivate(__global int* streams, uint* g1, uint* g2, const int start){
int Drow, Dcol, DrowStart; for(Drow = 0,DrowStart = start, Dcol = DrowStart + 3;
Drow < 3; Drow++, DrowStart++, Dcol++){
streams[DrowStart] = g1[Drow];
streams[Dcol] = g2[Drow];
}
}
uint clrngMrg31k3pNextState(uint *g1, uint *g2) {
uint y1, y2;
y1 = ((g1[1] & mrg31k3p_MASK12) << 22) + (g1[1] >> 9)
+ ((g1[2] & mrg31k3p_MASK13) << 7) + (g1[2] >> 24);
if (y1 >= mrg31k3p_M1)
y1 -= mrg31k3p_M1;
y1 += g1[2];
if (y1 >= mrg31k3p_M1)
y1 -= mrg31k3p_M1;
g1[2] = g1[1];
g1[1] = g1[0];
g1[0] = y1;
y1 = ((g2[0] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[0] >> 16));
if (y1 >= mrg31k3p_M2)
y1 -= mrg31k3p_M2;
y2 = ((g2[2] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[2] >> 16));
if (y2 >= mrg31k3p_M2)
y2 -= mrg31k3p_M2;
y2 += g2[2];
if (y2 >= mrg31k3p_M2)
y2 -= mrg31k3p_M2;
y2 += y1;
if (y2 >= mrg31k3p_M2)
y2 -= mrg31k3p_M2;
g2[2] = g2[1];
g2[1] = g2[0];
g2[0] = y2;
if (g1[0] <= g2[0]){
return (g1[0] - g2[0] + mrg31k3p_M1);
} else {
return(g1[0] - g2[0]);
}
}
__kernel void mrg31k3pMatrix(
__global int* streams,
__global float* out){
const int index = get_global_id(0)*get_global_size(1) + get_global_id(1);
int Drow, Dcol, DrowStart, Dentry, DrowBlock, DcolBlock, DrowInBounds;
const int DrowStartInc = get_global_size(0) * NpadCol;
uint g1[3], g2[3];
const int startvalue=index * NpadStreams;
float temp;
const float fact[2] = { mrg31k3p_NORM_cl, TWOPI * mrg31k3p_NORM_cl };
const float addForSine[2] = { 0.0, - PI_2 };
local float part[2];
streamsToPrivate(streams,g1,g2,startvalue);
for(DrowBlock = 0, Drow=get_global_id(0), DrowStart = Drow * NpadCol;
DrowBlock < Nrow;
Drow += get_global_size(0), DrowBlock +=get_global_size(0), DrowStart += DrowStartInc) {
DrowInBounds = Drow < Nrow;
for(DcolBlock = 0, Dcol=get_global_id(1), Dentry = DrowStart + Dcol;
DcolBlock < Ncol;
DcolBlock += get_global_size(1), Dentry += get_global_size(1) ) {
part[get_local_id(1)] = fact[get_local_id(1)] * clrngMrg31k3pNextState(g1, g2);
barrier(CLK_LOCAL_MEM_FENCE);
temp = sqrt( -2.0*log(part[0]) ) * cos(part[1] + addForSine[get_local_id(1)] );// is cos for local0, sine for local1
if(DrowInBounds) out[Dentry] = temp;
barrier(CLK_LOCAL_MEM_FENCE);
}//Dcol
}//Drow
streamsFromPrivate(streams,g1,g2,startvalue);
}//kernel
Error: ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE.
If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information:
* Operating System
* Which OpenCL implementation (AMD, NVIDIA, etc.)
* ViennaCL version
Many thanks in advance!
Execution halted
Flavor: r-release-macos-x86_64
Version: 0.0.5
Check: re-building of vignette outputs
Result: ERROR
Error(s) in re-building vignettes:
--- re-building ‘createStreams.Rmd’ using knitr
--- finished re-building ‘createStreams.Rmd’
--- re-building ‘fisher.sim.Rmd’ using knitr
--- finished re-building ‘fisher.sim.Rmd’
--- re-building ‘random_numbers.Rmd’ using knitr
Quitting from random_numbers.Rmd:62-96 [rnormGpu]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<error/rlang_error>
Error:
! ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE.
If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information:
* Operating System
* Which OpenCL implementation (AMD, NVIDIA, etc.)
* ViennaCL version
Many thanks in advance!
---
Backtrace:
▆
1. ├─base::as.vector(clrng::rnormGpu(10, myStreamsGpu, verbose = 2))
2. └─clrng::rnormGpu(10, myStreamsGpu, verbose = 2)
3. └─clrng:::gpuRnBackend(xVcl, streams, Nglobal, "normal", verbose)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Error: processing vignette 'random_numbers.Rmd' failed with diagnostics:
ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE.
If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information:
* Operating System
* Which OpenCL implementation (AMD, NVIDIA, etc.)
* ViennaCL version
Many thanks in advance!
--- failed re-building ‘random_numbers.Rmd’
SUMMARY: processing the following file failed:
‘random_numbers.Rmd’
Error: Vignette re-building failed.
Execution halted
Flavor: r-release-macos-x86_64
Version: 0.0.5
Check: installed package size
Result: NOTE
installed size is 11.5Mb
sub-directories of 1Mb or more:
libs 11.3Mb
Flavors: r-oldrel-macos-arm64, r-oldrel-macos-x86_64
Version: 0.0.5
Check: examples
Result: ERROR
Running examples in ‘clrng-Ex.R’ failed
The error most likely occurred in:
> ### Name: createStreamsCpu
> ### Title: createStreamsCpu
> ### Aliases: createStreamsCpu
>
> ### ** Examples
>
> library(clrng)
> if (detectGPUs() >= 1) {
+ t(createStreamsCpu(n=5))
+ ## GPU streams
+ myStreamsGpu = vclMatrix(createStreamsCpu(n=4)) }else {
+ message("No GPU context available")
+ }
Build Status = -2 ( Err = -11 )
Log: CVMS_ERROR_INVALID_FILE_DES: Invalid file descriptor.
Sources: __kernel void am_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2) {
int alpha = fac2;
if (options2 & (1 << 0))
alpha = -alpha;
if (options2 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha ;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha ;
}
}
__kernel void am_gpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2) {
int alpha = fac2[0];
if (options2 & (1 << 0))
alpha = -alpha;
if (options2 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha ;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha ;
}
}
__kernel void ambm_cpu_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
int fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2;
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3;
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_cpu_gpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
__global int * fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2;
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3[0];
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_gpu_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
int fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2[0];
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3;
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_gpu_gpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
__global int * fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2[0];
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3[0];
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_m_cpu_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
int fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2;
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3;
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_m_cpu_gpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
__global int * fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2;
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3[0];
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_m_gpu_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
int fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2[0];
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3;
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void ambm_m_gpu_gpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * fac2,
unsigned int options2,
__global const int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
__global int * fac3,
unsigned int options3,
__global const int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2
) {
int alpha = fac2[0];
if (options2 & (1 << 0))
alpha = -alpha;
int beta = fac3[0];
if (options3 & (1 << 0))
beta = -beta;
if (options2 & (1 << 1)) {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
} else {
if (options3 & (1 << 1)) {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] / beta;
} else {
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] += B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * alpha + C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] * beta;
}
}
}
__kernel void assign_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int alpha)
{
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha;
}
__kernel void diagonal_assign_cpu(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int alpha)
{
for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))
A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha;
}
__kernel void element_op(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
__global int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2,
unsigned int op_type)
{
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
if (op_type == 2) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
{ int factor = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)];
int power = C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)];
int val = (power >= 0) ? 1 : 0;
for (int p = 0; p < power; ++p)
val *= factor;
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = val;
} } else if (op_type == 1) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] =
B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] /
C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)];
} else if (op_type == 0) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] =
B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] *
C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)];
}}
__kernel void element_op_va(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
__global int * B,
unsigned int B_start1, unsigned int B_start2,
unsigned int B_inc1, unsigned int B_inc2,
unsigned int B_internal_size1, unsigned int B_internal_size2,
int alpha,
unsigned int op_type)
{
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
if (op_type == 2) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
{ int factor = B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)];
int power = alpha;
int val = (power >= 0) ? 1 : 0;
for (int p = 0; p < power; ++p)
val *= factor;
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = val;
} } else if (op_type == 1) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] =
B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] /
alpha;
} else if (op_type == 0) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] =
B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] *
alpha;
}}
__kernel void element_op_av(
__global int * A,
unsigned int A_start1, unsigned int A_start2,
unsigned int A_inc1, unsigned int A_inc2,
unsigned int A_size1, unsigned int A_size2,
unsigned int A_internal_size1, unsigned int A_internal_size2,
int alpha,
__global int * C,
unsigned int C_start1, unsigned int C_start2,
unsigned int C_inc1, unsigned int C_inc2,
unsigned int C_internal_size1, unsigned int C_internal_size2,
unsigned int op_type)
{
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
if (op_type == 2) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
{ int factor = alpha;
int power = C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)];
int val = (power >= 0) ? 1 : 0;
for (int p = 0; p < power; ++p)
val *= factor;
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = val;
} } else if (op_type == 1) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] =
alpha /
C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)];
} else if (op_type == 0) { for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] =
alpha *
C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)];
}}
__kernel void trans_vec_mul(
__global const int * A,
unsigned int A_row_start, unsigned int A_col_start,
unsigned int A_row_inc, unsigned int A_col_inc,
unsigned int A_row_size, unsigned int A_col_size,
unsigned int A_internal_rows, unsigned int A_internal_cols,
__global const int * v,
unsigned int v_start, unsigned int v_inc, unsigned int v_size,
__global int * result,
unsigned int result_start, unsigned int result_inc, unsigned int result_size,
__local int * work)
{
for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0))
{
int dot_prod = 0;
for (unsigned int col = 0; col < A_row_size; ++col)
dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
result[row * result_inc + result_start] = dot_prod;
}
}
__kernel void vec_mul(
__global const int * A,
unsigned int A_row_start, unsigned int A_col_start,
unsigned int A_row_inc, unsigned int A_col_inc,
unsigned int A_row_size, unsigned int A_col_size,
unsigned int A_internal_rows, unsigned int A_internal_cols,
__global const int * v,
unsigned int v_start, unsigned int v_inc, unsigned int v_size,
__global int * result,
unsigned int result_start, unsigned int result_inc, unsigned int result_size,
__local int * work)
{
unsigned int row_gid = get_global_id(0) / get_local_size(0);
unsigned int col_gid = get_global_id(0) % get_local_size(0);
unsigned int lid = get_local_id(0);
for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0))
{
int dot_prod = 0;
for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0))
dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
work[lid] = dot_prod;
for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < stride)
work[lid] += work[lid+stride];
}
if(lid == 0)
result[row * result_inc + result_start] = work[0];
}
}
Error in (function (cond) :
error in evaluating the argument 'x' in selecting a method for function 't': ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE.
If you think that this is a bug in ViennaCL, please report it at viennacl-support@lists.sourceforge.net and supply at least the following information:
* Operating System
* Which OpenCL implementation (AMD, NVIDIA, etc.)
* ViennaCL version
Many thanks in advance!
Calls: t ... new -> initialize -> initialize -> cpp_scalar_vclMatrix
Execution halted
Flavor: r-oldrel-macos-x86_64