#include "../gcnc.h" // The GCNC header file
//------------------------------------------------------------------------
// Skeleton program compiled by the OCL compiler
// Synchronizes the opencl system with the GCNC compiler
//------------------------------------------------------------------------
#ifndef _MIXCODE_LCC
kernel void lcctest(
global double *p,
global double *q,
global int *prb,
const int a,
const int b)
{
int x = get_global_id(0), y = get_global_id(1);
p[y] = q[prb[x]];
}
#endif
//------------------------------------------------------------------------
// A GCNC C program with some example syntax
//------------------------------------------------------------------------
#ifdef _MIXCODE_LCC
#define WAVE_BAR_0 1
#define N_WAVES 31
int real_function(int,int); // forward delaration of a callable functions
extern sregister unsigned long exec,vcc; // access to special register 'exec' , 'vcc'
extern sregister unsigned int scc; // access to special register 'scc'
extern sregister unsigned int m0; // access to special register 'm0'
//BUILT IN FUNCTIONS, INSTRUCTIONS, and ASM()
builtin float sin(float); // builtin function sin(x), units of 2*PI
builtin float sin_2pi(float); // builtin function 1 insn sin(x), units of 1.0
builtin float v_sin_f32(float); // builtin 'instruction' uses GCN insn names
asm("v_sin_f32 #0 #1",x,y); // inline assembly code
builtin float v_sin_f32(float); // by default, 'builtin' return vgprs
sbuiltin unsigned long mem_time(void); // get gpu clock, 'sbuiltin' returns sgprs
builtin double fma64(double,double,double); // many builtin functions
builtin uint bitcount(uint, uint); // builtin GCN bit op functions
builtin void ds_max_f64(double *,double,...); // lds fucntions, variable args for offsets
builtin void gs_max_f64(double *,double); // gds fucntions can use gd_ prefix
builtin int atomic_cmpswap(int,int); // builtin global atomics
builtin void tbuf_load(buf, yy, 3*8); // global read/write builtin functions
builtin void tbuf_load4_glc(buf, yy, 3*8); // global read/write functions use glc bit
#define CON_INT 123 // constant forms
#define CON_UINT 123U
#define CON_LONG 123L
#define CON_ULONG 123LU
#define CON_FLOAT 1.234f
#define CON_DOUBLE 1.234
struct tree{ // structures and pointers
struct tree *l,*r; // pointers to structures
int a:8,b:12,c:12;
int f;
};
//------------------------------------------------------------------------------------------
kernel void lcctest( //
global double * buf,
global double * out,
global int * _printf_, // buffer for fast printf
const int arg2,
const int arg3
)
{
register int gx,gy,gid; // private variables are type 'register'
int a,j,k,vgpr; // 'register' is assumed
uint ui,ret; // supports int, uint
register long longtime; // long, ulong
register char ch; // char, uchar
register short sh; // short, ushort
register float x,y,z; // float
register double xx,yy,zz; // double
register int array2d[3][2]; // constant indexed register arrays
sregister int sgpr1,sgpr2; // SGPRS are defined as 'sregister'
sregister long sgpr64,t2; // long SGPRS, s[22:23] are even aligned
local double localdouble; // local memory variable (lds)
local int localbuf[1024]; // local memory buffer (lds)
local int *lp; // pointer to local memory
local struct tree trees[4],*tp; // local structures and struct pointers
_gds int gdsval; // gds memory buffer and vars,
_gds long gdslong; // gds seen by all kernels
_gds int gds_buf[512]; // gds arrays
_gds double gdsdouble; // double floating point _gds value
_gds int *gp; // pointer to gds memory
global sregister double *sp; // pointer to global memory with SGPR
global register double *vp; // pointer to global memory with VGPR
register double *vp; // pointers assumed to point to global memory
gx = get_global_id(0); // basic opencl functions
gy = get_global_id(1); //
gid = 256 * gy + gx; // calculate global id
#define USE_GWS_BARRIER
#ifdef USE_GWS_BARRIER
ret=atomic_inc((uint *)buf,MAXU); // first addr = 0, ret=0 indicates first wave
if(ret == 0) // first wave initializes a gws global barrier.
gws_init(N_WAVES, WAVE_BAR_0); // .. for all 128 waves, used below
#endif
if(gid==3) //fast printf, runs near full speed
printf("hello amd = %d\n",gid); //prints to screen, uses global arg _printf_
sgpr64 = vcc; // read/write registers vcc, m0, exec, etc.
sgpr64 = exec; //save exec register
exec = 0xffffffffffffffffUL; // this is dangerous.
exec = sgpr64; // restore exec register
a = m0; // read the the m0 register.
yy = fma64(yy, xx, yy); // builtin math functions, eg, double fma
ui = 0xBEEFBABE; // builtin bit functions
j = bitcount(bitrev(ui), ffbitl(ui)); // no. of bits in a + value of first bit in a
localdouble = 1.7777; // 64 bit local memory
ds_max_f64(&localdouble, 3.3); // double atomic max in local memory
#define CFLOAT 1.2345678f // float constant
#define CDBL 1.2345678912345678 // double constant
ds_min_f64(&localdouble, CDBL, 4); // local atomics with optional offset
y = v_cos_f32(x); // builtin instructions use gcn assembler names
a = v_mad_i32_i24(j, a, 12); // most hardware instructions are available
z = v_add_f32(abs(x), -y); // gcn optimiztions for abs() and sign
k = v_subb_u32(vcc, a, j, vcc); // BIs use registers vcc, scc, etc
k = v_add_i32(sgpr1, a, j); // BIs use user's SGPR registers
precise24(); //int mul/div precision set to 24bits
k = a * j + d; // v_mad_i32_i24 1 clock.
precise32(); //precision returns to 32 bits
k = a * j + d; // v_mad_i32 4 clocks.
// inline assembly gcn instructions
asm("v_add_i32 #0, vcc, #1, 123",a,j); // referencing C register variables
asm("v_mul_f32 #0, abs(#2), -#1",x,y,x);// optimization available via the assembler
asm("v_nop");
asm(".message: here I am!");
// Expression optimization
y = abs(x) * y - abs(z); // this compiles to one instruction.
ui = (a & ~k) | (j & k); // Compiles to one instruction d = bfi(k, j, a);
// Compute z = sin(x)^2 + cos(x)^2
y = sin2pi(x); // sin2pi and cos2pi are native instructions
x = cos2pi(x); // both have domains in units of 2*PI
z = x*x + y*y; // Full computation takes 4 instructions
// global memory,
yy = buf[3]; // compiler generates waits for 'C' expressions
sp = &buf[3]; // pointer to global memory
yy = *sp; // read global memory
tbuf_load2(buf, yy, 3*8); // read using builtin function
tbuf_load2_glc(buf, yy, 3*8); // force the glc bit to bypass the cache
//...
vm_wait(0); // user added wait() for 'builtin' instructions
if(y > x + 3.3f)y=0.0; else y=x; // simple conditionals use v_cndmask instruction
if(y==x && i>3) // most conditionals use v_cmpx_(op), short code
for(a = 0; a < 7; a++){ // do something in a loop
for(j = 0; j < 7; j++)a = a + j; // do more
}
lp = &localbuf[3]; // lds and gds access, pointer to local memory
localbuf[4] = *(lp + 2); // localbuf[4] = localbuf[5]
gp = &gds_buf[3]; // pointer to _gds memory
gds_buf[4] = *(gp + 2); // gds_buf[4] = gds_buf[5]
//LDS/GDS builtin functionss
ds_atom_inc((uint *)&localbuf[4]); // atomic inc of global data store
ds_atom_inc((uint *)&gds_buf[8]); // ds functionss use lds or gds
ds_cmpst_f64(&localdouble, 1.22); // LDS low level instructions with gcn names
gs_cmpst_f64(&gdsdouble, 3.44, 8); // GDS instruction with optional offset
a = vgpr + sgpr1; // mixing SGPRs with VGPRs is allowed
j = gx & 0x3f; // readlane 1) get the local thread id in wave
sgpr1 = 8; // readlane 2) select lane no. 8 in sgpr1
sgpr2 = v_readlane_b32(j, sgpr1); // readlane 3) read j from thread 8 (lane 8)
j = sgpr2; // readlane 4) assign to j for all threads
tp=&trees[0]; // point to a structure
if(gx < 0)tp = (tp->l)->r; // structure pointer to pointer to ......
yy = localdouble; // read local double set at the top
yy = xx / yy; // full precision double divide
sgpr64 = mem_time(); // using the gpu clock to time insturctions
y = cos2pi(x); // measure clocks for this instruction
t2 = mem_time(); // most insns are 4 clocks, others are 6 or 8
ds_wait(0); // wait for timer to return
longtime = t2 - sgpr64; // most insns are 4 clocks, others are 6 or 8
#ifdef USE_GWS_BARRIER
gws_barrier(WAVE_BAR_0); // global wave barrier, everyone wait here!
#endif
j = real_function(a,0); // real stack function call
out[gid] = yy; // write something before leaving
}
int real_function(int a,int b) // a non-inline function, can be reentrant
{
int tmp; // local variables are temporary registers
tmp = a * b; // do something
return tmp; // return
}
#endif
//------------------------------------------------------------------------------------------
// Alternatively, programs are written using GCN ISA code
// Option 1, first compile a shell program using the OCL compiler and paste here
// Option 2, compile a shell program using the GCNC compiler and paste here
// Option 3, Write your own ISA program, requires in depth knowlege opencl internals
//------------------------------------------------------------------------------------------
#ifdef _MIXCODE_GCN
x_set_nvgpr 63
x_set_nsgpr 40
x_set_ldsmax 0x7fff
s_mov_b32 m0, 0x00008000
s_buffer_load_dwordx2 s[0:1], s[4:7], 0x04
s_waitcnt lgkmcnt(0)
s_mul_i32 s0, s12, s0
s_mul_i32 s1, s13, s1
v_add_i32 v0, vcc, s0, v0
v_add_i32 v1, vcc, s1, v1
s_load_dwordx4 s[20:23], s[2:3], 0x50
s_load_dwordx4 s[16:19], s[2:3], 0x58
s_buffer_load_dword s24, s[8:11], 0x00
s_buffer_load_dword s25, s[8:11], 0x04
s_buffer_load_dword s26, s[8:11], 0x08
s_buffer_load_dword s27, s[8:11], 0x0c
s_buffer_load_dword s28, s[8:11], 0x10
s_waitcnt lgkmcnt(0)
v_mov_b32 v5, v0
v_mov_b32 v6, v1
v_lshlrev_b32 v11, 8, v6
v_add_i32 v7, vcc, v5, v11
v_mov_b32 v11, 0x3333
v_mov_b32 v2, 0
s_waitcnt lgkmcnt(0)
v_lshlrev_b32 v11, 2, v7
v_add_i32 v11, vcc, s25, v11
s_waitcnt lgkmcnt(0)
v_mov_b32 v0,1.111
v_mov_b32 v1,7.0
v_mov_b32 v9,10.0
v_mac_f32 v9,v1,2.222
tb_store_fmt_x v9, v11, s[20:23], 0 offen format:TF_X
label_1:
s_endpgm
end
.end lcctest
#endif //_MIXCODE_GCN
//------------------------------------------------------------------------------------------
|