| R Allan Barker science | technology | history | philosophy + curiosity | |
| Last update December 5, 2013 |
Contact me |
| GCN GCN, Graphics Core Next, is AMD's new processor architecture used in GPUs and integrated CPU/graphics processors. For anyone who enjoys the power of lower level programming, GCN ia all new and amazingly powerful. At the assembly level, programming is similar to traditional CPUs but uses two interwoven instruction sets, one for vector operations and one for scalars. The two run in parallel and access common resources. Both have rich and powerful instructions like single cycle floating point transcendentals and a wide range of bit operations. Forget twiddling bits, now you can bash them. The problem is you cannot access GCN's full power. GCNC compiler and assembler, extending OPENCL Currently the only way to program GCN gpus is with openCL, which is hardware architecture independent leaving much of GCN's power inaccessible. OpenCL doesn't even support global synchronization, its roots come from graphics programming. GCNC implements a traditional C/assembler programming environment that compiles directly to GCN machine code. C and/or assembly (ISA) programs are compiled to binary and run from the user's host program like OpenCL, using the OpenCL environment. Source code files can contain sections with GCNC C, ISA assembly, and AMD's IL code. Compilation is directed by simple #define statements in the host program, for which I use mingw gcc C. GCNC is based on the wonderful LCC compiler and is still under development. LCC provides very good optimization for its age but works on a different paradigm than modern compilers like AMD's openCL. While modern compilers often fully rewrite programs, GCNC optimizes the code the programmer writes. Optimization is a dual effort between programmer and compiler allowing the programmer to control the output, just like the good old days. GCNC fully opens the GCN architecture by including low level built in functions, instructions, register access, and inline assembly. The assembler implements the full GCN instruction set and does assembly level optimization to find the best GCN instruction formats. It also assembles AMD's GCN assembly code. Like C, GCNC started as a way to avoid coding in assembly yet maintain a strong link to hardware. As a project, it has been a daunting task to adapt to the full scope of a parallel environment like GCN. Currently, the pace of development is based mostly on need, although whim is often considered a need. While it implements C, complex syntax can still uncover areas that need further attention. I have now used GCNC for several substantial projects and the results have better than expected. Although AMD's openCL may write better code, the programmer's control over the output seems more important to producing very high performance code. It was my original hope to eventually share the development in some reasonable way, and I would still like to see that happen. Otherwise, it serves as one person's opinion of how to access the low level power of GCN. Below is a file showing the basic C syntax with examples of all 4 types of code, openCL, GCNC, assembly ISA, and AMD IL. |
#include "../gcnc.h" // The GCNC header file
//------------------------------------------------------------------------
// Skeleton program compiled by the OCL compiler
// Synchronizes the opencl system with the GCNC compiler
//------------------------------------------------------------------------
#ifndef _MIXCODE_LCC
kernel void lcctest(
global double *p,
global double *q,
global int *prb,
const int a,
const int b)
{
int x = get_global_id(0), y = get_global_id(1);
p[y] = q[prb[x]];
}
#endif
//------------------------------------------------------------------------
// A GCNC C program with some example syntax
//------------------------------------------------------------------------
#ifdef _MIXCODE_LCC
#define WAVE_BAR_0 1
#define N_WAVES 31
int real_function(int,int); // forward delaration of a callable functions
extern sregister unsigned long exec,vcc; // access to special register 'exec' , 'vcc'
extern sregister unsigned int scc; // access to special register 'scc'
extern sregister unsigned int m0; // access to special register 'm0'
//BUILT IN FUNCTIONS, INSTRUCTIONS, and ASM()
builtin float sin(float); // builtin function sin(x), units of 2*PI
builtin float sin_2pi(float); // builtin function 1 insn sin(x), units of 1.0
builtin float v_sin_f32(float); // builtin 'instruction' uses GCN insn names
asm("v_sin_f32 #0 #1",x,y); // inline assembly code
builtin float v_sin_f32(float); // by default, 'builtin' return vgprs
sbuiltin unsigned long mem_time(void); // get gpu clock, 'sbuiltin' returns sgprs
builtin double fma64(double,double,double); // many builtin functions
builtin uint bitcount(uint, uint); // builtin GCN bit op functions
builtin void ds_max_f64(double *,double,...); // lds fucntions, variable args for offsets
builtin void gs_max_f64(double *,double); // gds fucntions can use gd_ prefix
builtin int atomic_cmpswap(int,int); // builtin global atomics
builtin void tbuf_load(buf, yy, 3*8); // global read/write builtin functions
builtin void tbuf_load4_glc(buf, yy, 3*8); // global read/write functions use glc bit
#define CON_INT 123 // constant forms
#define CON_UINT 123U
#define CON_LONG 123L
#define CON_ULONG 123LU
#define CON_FLOAT 1.234f
#define CON_DOUBLE 1.234
struct tree{ // structures and pointers
struct tree *l,*r; // pointers to structures
int a:8,b:12,c:12;
int f;
};
//------------------------------------------------------------------------------------------
kernel void lcctest( //
global double * buf,
global double * out,
global int * _printf_, // buffer for fast printf
const int arg2,
const int arg3
)
{
register int gx,gy,gid; // private variables are type 'register'
int a,j,k,vgpr; // 'register' is assumed
uint ui,ret; // supports int, uint
register long longtime; // long, ulong
register char ch; // char, uchar
register short sh; // short, ushort
register float x,y,z; // float
register double xx,yy,zz; // double
register int array2d[3][2]; // constant indexed register arrays
sregister int sgpr1,sgpr2; // SGPRS are defined as 'sregister'
sregister long sgpr64,t2; // long SGPRS, s[22:23] are even aligned
local double localdouble; // local memory variable (lds)
local int localbuf[1024]; // local memory buffer (lds)
local int *lp; // pointer to local memory
local struct tree trees[4],*tp; // local structures and struct pointers
_gds int gdsval; // gds memory buffer and vars,
_gds long gdslong; // gds seen by all kernels
_gds int gds_buf[512]; // gds arrays
_gds double gdsdouble; // double floating point _gds value
_gds int *gp; // pointer to gds memory
global sregister double *sp; // pointer to global memory with SGPR
global register double *vp; // pointer to global memory with VGPR
register double *vp; // pointers assumed to point to global memory
gx = get_global_id(0); // basic opencl functions
gy = get_global_id(1); //
gid = 256 * gy + gx; // calculate global id
#define USE_GWS_BARRIER
#ifdef USE_GWS_BARRIER
ret=atomic_inc((uint *)buf,MAXU); // first addr = 0, ret=0 indicates first wave
if(ret == 0) // first wave initializes a gws global barrier.
gws_init(N_WAVES, WAVE_BAR_0); // .. for all 128 waves, used below
#endif
if(gid==3) //fast printf, runs near full speed
printf("hello amd = %d\n",gid); //prints to screen, uses global arg _printf_
sgpr64 = vcc; // read/write registers vcc, m0, exec, etc.
sgpr64 = exec; //save exec register
exec = 0xffffffffffffffffUL; // this is dangerous.
exec = sgpr64; // restore exec register
a = m0; // read the the m0 register.
yy = fma64(yy, xx, yy); // builtin math functions, eg, double fma
ui = 0xBEEFBABE; // builtin bit functions
j = bitcount(bitrev(ui), ffbitl(ui)); // no. of bits in a + value of first bit in a
localdouble = 1.7777; // 64 bit local memory
ds_max_f64(&localdouble, 3.3); // double atomic max in local memory
#define CFLOAT 1.2345678f // float constant
#define CDBL 1.2345678912345678 // double constant
ds_min_f64(&localdouble, CDBL, 4); // local atomics with optional offset
y = v_cos_f32(x); // builtin instructions use gcn assembler names
a = v_mad_i32_i24(j, a, 12); // most hardware instructions are available
z = v_add_f32(abs(x), -y); // gcn optimiztions for abs() and sign
k = v_subb_u32(vcc, a, j, vcc); // BIs use registers vcc, scc, etc
k = v_add_i32(sgpr1, a, j); // BIs use user's SGPR registers
precise24(); //int mul/div precision set to 24bits
k = a * j + d; // v_mad_i32_i24 1 clock.
precise32(); //precision returns to 32 bits
k = a * j + d; // v_mad_i32 4 clocks.
// inline assembly gcn instructions
asm("v_add_i32 #0, vcc, #1, 123",a,j); // referencing C register variables
asm("v_mul_f32 #0, abs(#2), -#1",x,y,x);// optimization available via the assembler
asm("v_nop");
asm(".message: here I am!");
// Expression optimization
y = abs(x) * y - abs(z); // this compiles to one instruction.
ui = (a & ~k) | (j & k); // Compiles to one instruction d = bfi(k, j, a);
// Compute z = sin(x)^2 + cos(x)^2
y = sin2pi(x); // sin2pi and cos2pi are native instructions
x = cos2pi(x); // both have domains in units of 2*PI
z = x*x + y*y; // Full computation takes 4 instructions
// global memory,
yy = buf[3]; // compiler generates waits for 'C' expressions
sp = &buf[3]; // pointer to global memory
yy = *sp; // read global memory
tbuf_load2(buf, yy, 3*8); // read using builtin function
tbuf_load2_glc(buf, yy, 3*8); // force the glc bit to bypass the cache
//...
vm_wait(0); // user added wait() for 'builtin' instructions
if(y > x + 3.3f)y=0.0; else y=x; // simple conditionals use v_cndmask instruction
if(y==x && i>3) // most conditionals use v_cmpx_(op), short code
for(a = 0; a < 7; a++){ // do something in a loop
for(j = 0; j < 7; j++)a = a + j; // do more
}
lp = &localbuf[3]; // lds and gds access, pointer to local memory
localbuf[4] = *(lp + 2); // localbuf[4] = localbuf[5]
gp = &gds_buf[3]; // pointer to _gds memory
gds_buf[4] = *(gp + 2); // gds_buf[4] = gds_buf[5]
//LDS/GDS builtin functionss
ds_atom_inc((uint *)&localbuf[4]); // atomic inc of global data store
ds_atom_inc((uint *)&gds_buf[8]); // ds functionss use lds or gds
ds_cmpst_f64(&localdouble, 1.22); // LDS low level instructions with gcn names
gs_cmpst_f64(&gdsdouble, 3.44, 8); // GDS instruction with optional offset
a = vgpr + sgpr1; // mixing SGPRs with VGPRs is allowed
j = gx & 0x3f; // readlane 1) get the local thread id in wave
sgpr1 = 8; // readlane 2) select lane no. 8 in sgpr1
sgpr2 = v_readlane_b32(j, sgpr1); // readlane 3) read j from thread 8 (lane 8)
j = sgpr2; // readlane 4) assign to j for all threads
tp=&trees[0]; // point to a structure
if(gx < 0)tp = (tp->l)->r; // structure pointer to pointer to ......
yy = localdouble; // read local double set at the top
yy = xx / yy; // full precision double divide
sgpr64 = mem_time(); // using the gpu clock to time insturctions
y = cos2pi(x); // measure clocks for this instruction
t2 = mem_time(); // most insns are 4 clocks, others are 6 or 8
ds_wait(0); // wait for timer to return
longtime = t2 - sgpr64; // most insns are 4 clocks, others are 6 or 8
#ifdef USE_GWS_BARRIER
gws_barrier(WAVE_BAR_0); // global wave barrier, everyone wait here!
#endif
j = real_function(a,0); // real stack function call
out[gid] = yy; // write something before leaving
}
int real_function(int a,int b) // a non-inline function, can be reentrant
{
int tmp; // local variables are temporary registers
tmp = a * b; // do something
return tmp; // return
}
#endif
//------------------------------------------------------------------------------------------
// Alternatively, programs are written using GCN ISA code
// Option 1, first compile a shell program using the OCL compiler and paste here
// Option 2, compile a shell program using the GCNC compiler and paste here
// Option 3, Write your own ISA program, requires in depth knowlege opencl internals
//------------------------------------------------------------------------------------------
#ifdef _MIXCODE_GCN
x_set_nvgpr 63
x_set_nsgpr 40
x_set_ldsmax 0x7fff
s_mov_b32 m0, 0x00008000
s_buffer_load_dwordx2 s[0:1], s[4:7], 0x04
s_waitcnt lgkmcnt(0)
s_mul_i32 s0, s12, s0
s_mul_i32 s1, s13, s1
v_add_i32 v0, vcc, s0, v0
v_add_i32 v1, vcc, s1, v1
s_load_dwordx4 s[20:23], s[2:3], 0x50
s_load_dwordx4 s[16:19], s[2:3], 0x58
s_buffer_load_dword s24, s[8:11], 0x00
s_buffer_load_dword s25, s[8:11], 0x04
s_buffer_load_dword s26, s[8:11], 0x08
s_buffer_load_dword s27, s[8:11], 0x0c
s_buffer_load_dword s28, s[8:11], 0x10
s_waitcnt lgkmcnt(0)
v_mov_b32 v5, v0
v_mov_b32 v6, v1
v_lshlrev_b32 v11, 8, v6
v_add_i32 v7, vcc, v5, v11
v_mov_b32 v11, 0x3333
v_mov_b32 v2, 0
s_waitcnt lgkmcnt(0)
v_lshlrev_b32 v11, 2, v7
v_add_i32 v11, vcc, s25, v11
s_waitcnt lgkmcnt(0)
v_mov_b32 v0,1.111
v_mov_b32 v1,7.0
v_mov_b32 v9,10.0
v_mac_f32 v9,v1,2.222
tb_store_fmt_x v9, v11, s[20:23], 0 offen format:TF_X
label_1:
s_endpgm
end
.end lcctest
#endif //_MIXCODE_GCN
//------------------------------------------------------------------------------------------
// Can can also be written in AMD IL code, these are processor specific
// Code destined to Tahiti
//------------------------------------------------------------------------------------------
#ifdef _MIXCODE_TAHITI
mov r66, r1021.xyz0
mov r65.__z_, r66.00y0
mov r65.___w, l11
ishl r65.__z_, r65.z, r65.w
mov r65.___w, r66.000x
iadd r65.__z_, r65.z, r65.w
mov r65.___w, l12
ishl r65.__z_, r65.z, r65.w
iadd r65.x___, r1.y, r65.z
iadd r65._y__, r1.x, r65.z
mov r1010.x___, r65.y
uav_raw_load_id(10)_cached r1011.x___, r1010.x
mov r65._y__, r1011.x
mov r1011, r65.y
mov r1010.x___, r65.x
uav_raw_store_id(11) mem0.x___, r1010.x, r1011.x
ret_dyn
#endif //_MIXCODE_TAHITI
//------------------------------------------------------------------------
// Can can also be written in AMD IL code, these are processor specific
// Code destined to Cayman
//------------------------------------------------------------------------
#ifdef _MIXCODE_CAYMAN
dcl_literal l88, 0x00005555, 0x00ff0000, 16, 0x0001FBC9; f32:i32 129993
mov r66, r1021.xyz0
mov r65._y__, r66.0y00
mov r65.__z_, l11
ishl r65._y__, r65.y, r65.z
mov r65.__z_, r66.00x0
iadd r65._y__, r65.y, r65.z
mov r65.__z_, l12
ishl r65._y__, r65.y, r65.z
iadd r65.x___, r1.y, r65.y
mov r1011.y, Tmr.y
mov r1011.x, Tmr.x
mov r1010.x, r65.x
uav_raw_store_id(11) mem0.xy__, r1010.x, r1011.xyxy
ret
#endif //_MIXCODE_CAYMAN
//------------------------------------------------------------------------------------------
|