提交 abbba6cf authored 作者: Frederic Bastien's avatar Frederic Bastien

Removed the prefix cnda_ as this caused a bug with OutputGuard on the GPU.

上级 f6d0fcef
......@@ -250,34 +250,34 @@ class GpuDimShuffle(Op):
#check input
print >> sio, """
if (cnda_%(input)s->nd != %(nd_in)s)
if (%(input)s->nd != %(nd_in)s)
{
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", cnda_%(input)s->nd);
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(input)s->nd);
%(fail)s;
}
""" %locals()
#alloc an output
print >> sio, """
if (cnda_%(res)s && (cnda_%(res)s->nd == %(nd_out)s))
if (%(res)s && (%(res)s->nd == %(nd_out)s))
{
//re-use previously-allocated cnda
}
else
{
if (cnda_%(res)s)
if (%(res)s)
{
if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
if (CudaNdarray_set_nd(%(res)s, %(nd_out)s))
{
Py_DECREF(cnda_%(res)s);
cnda_%(res)s = NULL;
Py_DECREF(%(res)s);
%(res)s = NULL;
%(fail)s;
}
}
else
{
cnda_%(res)s = (CudaNdarray*) CudaNdarray_New(%(nd_out)s);
if (NULL == cnda_%(res)s)
%(res)s = (CudaNdarray*) CudaNdarray_New(%(nd_out)s);
if (NULL == %(res)s)
{
%(fail)s;
}
......@@ -286,11 +286,11 @@ class GpuDimShuffle(Op):
""" %locals()
print >> sio, """
if (CudaNdarray_set_device_data(cnda_%(res)s, CudaNdarray_DEV_DATA(cnda_%(input)s), cnda_%(input)s))
if (CudaNdarray_set_device_data(%(res)s, CudaNdarray_DEV_DATA(%(input)s), %(input)s))
{
// err message set
Py_DECREF(cnda_%(res)s);
cnda_%(res)s = NULL;
Py_DECREF(%(res)s);
%(res)s = NULL;
%(fail)s;
}
""" %locals()
......@@ -303,28 +303,28 @@ class GpuDimShuffle(Op):
# that the size in this dimension is 1
assert node.outputs[0].type.broadcastable[i]
print >> sio, """
CudaNdarray_set_dim(cnda_%(res)s, %(i)s, 1);
CudaNdarray_set_stride(cnda_%(res)s, %(i)s, 0);
CudaNdarray_set_dim(%(res)s, %(i)s, 1);
CudaNdarray_set_stride(%(res)s, %(i)s, 0);
""" %locals()
else:
print >> sio, """
CudaNdarray_set_dim(cnda_%(res)s, %(i)s, CudaNdarray_HOST_DIMS(cnda_%(input)s)[%(o)s]);
CudaNdarray_set_stride(cnda_%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(cnda_%(input)s)[%(o)s]);
CudaNdarray_set_dim(%(res)s, %(i)s, CudaNdarray_HOST_DIMS(%(input)s)[%(o)s]);
CudaNdarray_set_stride(%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(%(input)s)[%(o)s]);
""" %locals()
for i, o in enumerate(self.new_order):
print >> sio, """
//std::cerr << "GpuDimShuffle " << cnda_%(res)s << " str[%(i)s] = " << cnda_%(res)s->str[%(i)s] << "\\n";
//std::cerr << "GpuDimShuffle " << %(res)s << " str[%(i)s] = " << %(res)s->str[%(i)s] << "\\n";
""" %locals()
# copy the host dims and stride -> device
if 0:
print >> sio, """
if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
if (CudaNdarray_copy_structure_to_device(%(res)s))
{
//err msg set
Py_DECREF(cnda_%(res)s);
cnda_%(res)s = NULL;
Py_DECREF(%(res)s);
%(res)s = NULL;
%(fail)s;
}
""" %locals()
......@@ -405,9 +405,9 @@ class GpuSum(Op):
#check input
print >> sio, """
if (cnda_%(x)s->nd != %(nd_in)s)
if (%(x)s->nd != %(nd_in)s)
{
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", cnda_%(x)s->nd);
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd);
%(fail)s;
}
""" %locals()
......@@ -418,15 +418,15 @@ class GpuSum(Op):
# check the basics of out output
print >> sio, """
if ( !cnda_%(z)s
|| (cnda_%(z)s->nd != %(nd_out)s)
if ( !%(z)s
|| (%(z)s->nd != %(nd_out)s)
""" % locals()
#ensure that the output has the right non-reduced dimensions
j = 0
for i in xrange(nd_in):
if not self.reduce_mask[i]:
print >> sio, " || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[%(j)s] !=CudaNdarray_HOST_DIMS(cnda_%(x)s)[%(i)s]) " % locals()
print >> sio, " || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !=CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " % locals()
j += 1
print >> sio, """
......@@ -438,13 +438,13 @@ class GpuSum(Op):
j = 0
for i in xrange(nd_in):
if not self.reduce_mask[i]:
print >> sio, 'new_dims[%(j)s] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[%(i)s];' % locals()
print >> sio, 'new_dims[%(j)s] = CudaNdarray_HOST_DIMS(%(x)s)[%(i)s];' % locals()
j += 1
print >> sio, """
Py_XDECREF(cnda_%(z)s);
cnda_%(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims);
if (NULL == cnda_%(z)s)
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims);
if (NULL == %(z)s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s;
......@@ -469,13 +469,13 @@ class GpuSum(Op):
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0]
);
CNDA_THREAD_SYNC;
if (cudaSuccess != cudaGetLastError())
......@@ -495,21 +495,21 @@ class GpuSum(Op):
""" %locals()
for i in xrange(ndim):
print >> sio, """
CudaNdarray_HOST_DIMS(cnda_%(x)s)[%(i)s],
CudaNdarray_HOST_DIMS(%(x)s)[%(i)s],
""" %locals()
print >> sio, """
CudaNdarray_DEV_DATA(cnda_%(x)s)
CudaNdarray_DEV_DATA(%(x)s)
""" %locals()
for i in xrange(ndim):
print >> sio, """
,CudaNdarray_HOST_STRIDES(cnda_%(x)s)[%(i)s]
,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]
""" %locals()
print >> sio, """
,CudaNdarray_DEV_DATA(cnda_%(z)s)
,CudaNdarray_DEV_DATA(%(z)s)
""" %locals()
for i in xrange(nd_out):
print >> sio, """
,CudaNdarray_HOST_STRIDES(cnda_%(z)s)[%(i)s]
,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
""" %locals()
print >> sio, """
);
......@@ -626,16 +626,16 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_1_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_DEV_DATA(cnda_%(z)s));
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
......@@ -658,24 +658,24 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1;
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0])
n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0];
dim3 n_blocks(1);
if (verbose) fprintf(stdout, "running kernel_reduce_sum_11_%(name)s\\n");
if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s);
if (verbose) fprint_CudaNdarray(stdout, %(x)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_11_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(z)s));
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
......@@ -699,19 +699,19 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
......@@ -740,12 +740,12 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS)
{
if (n_blocks.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
break;
n_blocks.y += 1;
}
......@@ -760,17 +760,17 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0])
break;
n_threads.y += 1;
}
n_threads.y -= 1;
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]);
%(makecall)s
}
""" % locals()
......@@ -781,14 +781,14 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_BLOCKS));
while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS)
{
if (n_blocks.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[1])
if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
break;
n_blocks.y += 1;
}
......@@ -802,13 +802,13 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[1])
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
}
......@@ -817,7 +817,7 @@ class GpuSum(Op):
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{
if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
}
......@@ -833,39 +833,39 @@ class GpuSum(Op):
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1;
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
n_threads.z -= 1;
if (n_threads.z > 64)
n_threads.z = 64;
if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
n_threads.z = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s);
if (verbose) fprint_CudaNdarray(stdout, cnda_%(z)s);
if (verbose) fprint_CudaNdarray(stdout, %(x)s);
if (verbose) fprint_CudaNdarray(stdout, %(z)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]);
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
......
......@@ -29,42 +29,42 @@ class GpuDot22(Op):
z, = outputs
fail = sub['fail']
return """
if (cnda_%(x)s->nd != 2)
if (%(x)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", cnda_%(x)s->nd);
PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", %(x)s->nd);
%(fail)s;
}
if (cnda_%(y)s->nd != 2)
if (%(y)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", cnda_%(y)s->nd);
PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", %(y)s->nd);
%(fail)s;
}
if ((NULL == cnda_%(z)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]))
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(y)s)[1]))
{
//if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
Py_XDECREF(cnda_%(z)s);
//if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1];
cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(z)s) || CudaNdarray_alloc_contiguous(cnda_%(z)s, 2, dims))
dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1];
%(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == %(z)s) || CudaNdarray_alloc_contiguous(%(z)s, 2, dims))
{
if (cnda_%(z)s)
if (%(z)s)
{
Py_DECREF(cnda_%(z)s);
cnda_%(z)s = NULL;
Py_DECREF(%(z)s);
%(z)s = NULL;
}
%(fail)s;
}
}
if (CudaNdarray_gemm(1.0f, cnda_%(x)s, cnda_%(y)s, 0.0f, cnda_%(z)s))
if (CudaNdarray_gemm(1.0f, %(x)s, %(y)s, 0.0f, %(z)s))
{
if (cnda_%(z)s)
if (%(z)s)
{
Py_DECREF(cnda_%(z)s);
cnda_%(z)s = NULL;
Py_DECREF(%(z)s);
%(z)s = NULL;
}
%(fail)s;
}
......@@ -105,12 +105,12 @@ class GpuGemm(Op):
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
if (CudaNdarray_gemm(%(name)s_a, cnda_%(x)s, cnda_%(y)s, %(name)s_b, cnda_%(z_in)s))
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_in)s))
{
%(fail)s;
}
cnda_%(z_out)s = cnda_%(z_in)s;
Py_INCREF(cnda_%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
""" % locals()
gpu_gemm = GpuGemm()
......@@ -222,15 +222,15 @@ class GpuDownsampleFactorMax(Op):
ignore_border = int(self.ignore_border)
return """
int dims[4], xdim2, xdim3;
if (cnda_%(x)s->nd != 4)
if (%(x)s->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
xdim2 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
xdim3 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[3];
dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[1];
xdim2 = CudaNdarray_HOST_DIMS(%(x)s)[2];
xdim3 = CudaNdarray_HOST_DIMS(%(x)s)[3];
dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(%(x)s)[1];
dims[2] = xdim2 / %(ds0)s;
dims[3] = xdim3 / %(ds1)s;
if (! %(ignore_border)s)
......@@ -243,19 +243,19 @@ class GpuDownsampleFactorMax(Op):
%(fail)s;
}
if ((NULL == cnda_%(z)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != dims[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != dims[1])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[2] != dims[2])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[3] != dims[3]))
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != dims[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != dims[1])
|| (CudaNdarray_HOST_DIMS(%(z)s)[2] != dims[2])
|| (CudaNdarray_HOST_DIMS(%(z)s)[3] != dims[3]))
{
Py_XDECREF(cnda_%(z)s);
cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(z)s)
|| CudaNdarray_alloc_contiguous(cnda_%(z)s, 4, dims))
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(%(z)s, 4, dims))
{
Py_XDECREF(cnda_%(z)s);
cnda_%(z)s = NULL;
Py_XDECREF(%(z)s);
%(z)s = NULL;
PyErr_SetString(PyExc_ValueError, "Was not able to allocate output!");
%(fail)s;
}
......@@ -268,12 +268,12 @@ class GpuDownsampleFactorMax(Op):
if ((grid.x*grid.y) && dims[3])
kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>(
dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(z)s));
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -372,57 +372,57 @@ class GpuDownsampleFactorMaxGrad(Op):
ds0, ds1 = self.ds
ignore_border = int(self.ignore_border)
return """
if (cnda_%(x)s->nd != 4
|| cnda_%(z)s->nd != 4
|| cnda_%(gz)s->nd != 4)
if (%(x)s->nd != 4
|| %(z)s->nd != 4
|| %(gz)s->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if ((NULL == cnda_%(gx)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1])
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[2] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[3] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]))
if ((NULL == %(gx)s)
|| (CudaNdarray_HOST_DIMS(%(gx)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(gx)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1])
|| (CudaNdarray_HOST_DIMS(%(gx)s)[2] != CudaNdarray_HOST_DIMS(%(x)s)[2])
|| (CudaNdarray_HOST_DIMS(%(gx)s)[3] != CudaNdarray_HOST_DIMS(%(x)s)[3]))
{
Py_XDECREF(cnda_%(gx)s);
cnda_%(gx)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(gx)s)
|| CudaNdarray_alloc_contiguous(cnda_%(gx)s, 4, CudaNdarray_HOST_DIMS(cnda_%(x)s)))
Py_XDECREF(%(gx)s);
%(gx)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == %(gx)s)
|| CudaNdarray_alloc_contiguous(%(gx)s, 4, CudaNdarray_HOST_DIMS(%(x)s)))
{
Py_XDECREF(cnda_%(gx)s);
cnda_%(gx)s = NULL;
Py_XDECREF(%(gx)s);
%(gx)s = NULL;
%(fail)s;
}
}
{
//TODO: implement this by supporting more
//outputs than threads
dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]);
dim3 block(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]);
dim3 grid(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]);
dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]);
kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
CudaNdarray_HOST_DIMS(cnda_%(z)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(z)s)[1],
CudaNdarray_HOST_DIMS(cnda_%(z)s)[2],
CudaNdarray_HOST_DIMS(cnda_%(z)s)[3],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[3],
CudaNdarray_DEV_DATA(cnda_%(gz)s),
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[3],
CudaNdarray_DEV_DATA(cnda_%(gx)s));
CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[2],
CudaNdarray_HOST_DIMS(%(z)s)[3],
CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_HOST_STRIDES(%(z)s)[2],
CudaNdarray_HOST_STRIDES(%(z)s)[3],
CudaNdarray_DEV_DATA(%(gz)s),
CudaNdarray_HOST_STRIDES(%(gz)s)[0],
CudaNdarray_HOST_STRIDES(%(gz)s)[1],
CudaNdarray_HOST_STRIDES(%(gz)s)[2],
CudaNdarray_HOST_STRIDES(%(gz)s)[3],
CudaNdarray_DEV_DATA(%(gx)s));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......
......@@ -885,20 +885,20 @@ class NaiveAlgo(object):
for iname in inputs:
print >> sio, """
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != cnda_%(iname)s->nd)
if (%(nd)s != %(iname)s->nd)
{
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd);
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd);
%(fail)s;
}
for (int i = 0; i< %(nd)s; ++i)
{
dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] : dims[i];
if ((CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i]))
dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
if ((CudaNdarray_HOST_DIMS(%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
{
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
i,
CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i],
CudaNdarray_HOST_DIMS(%(iname)s)[i],
dims[i]
);
%(fail)s;
......@@ -909,31 +909,31 @@ class NaiveAlgo(object):
#check that all outputs have valid dimensions
for oname in outputs:
print >> sio, """
for (int i = 0; (i< %(nd)s) && (cnda_%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(cnda_%(oname)s)[i])
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{
Py_DECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
Py_DECREF(%(oname)s);
%(oname)s = NULL;
}
}
if (NULL == cnda_%(oname)s)
if (NULL == %(oname)s)
{
cnda_%(oname)s = (CudaNdarray*)CudaNdarray_new_null();
if (!cnda_%(oname)s)
%(oname)s = (CudaNdarray*)CudaNdarray_new_null();
if (!%(oname)s)
{
//error string already set
%(fail)s;
}
if (CudaNdarray_alloc_contiguous(cnda_%(oname)s, %(nd)s, dims))
if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
{
//error string already set
Py_DECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals()
print >> sio, """
{
......@@ -943,11 +943,11 @@ class NaiveAlgo(object):
""" % locals()
for iname in inputs:
print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_HOST_STRIDES(cnda_%(iname)s)
, CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
""" % locals()
for oname in outputs:
print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_HOST_STRIDES(cnda_%(oname)s)
, CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
""" % locals()
print >> sio, """
))
......@@ -956,8 +956,8 @@ class NaiveAlgo(object):
"""
for oname in outputs:
print >> sio, """
Py_DECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
Py_DECREF(%(oname)s);
%(oname)s = NULL;
""" % locals()
print >> sio, """
%(fail)s;
......
......@@ -81,60 +81,60 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
fail = sub['fail']
sio = StringIO.StringIO()
print >> sio, """
if (cnda_%(y_idx)s->nd != 1)
if (%(y_idx)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s;
}
if (cnda_%(x)s->nd != 2)
if (%(x)s->nd != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s;
}
if (cnda_%(b)s->nd != 1)
if (%(b)s->nd != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])
if (CudaNdarray_HOST_DIMS(%(x)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(b)s)[0])
if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments");
%(fail)s;
}
if ((NULL == cnda_%(nll)s) //initial condition
|| (CudaNdarray_HOST_DIMS(cnda_%(nll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
if ((NULL == %(nll)s) //initial condition
|| (CudaNdarray_HOST_DIMS(%(nll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(cnda_%(nll)s);
cnda_%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
if(!cnda_%(nll)s)
Py_XDECREF(%(nll)s);
%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!%(nll)s)
{
%(fail)s;
}
}
if ((NULL == cnda_%(sm)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]))
if ((NULL == %(sm)s)
|| (CudaNdarray_HOST_DIMS(%(sm)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(sm)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
{
Py_XDECREF(cnda_%(sm)s);
cnda_%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(cnda_%(x)s));
if(!cnda_%(sm)s)
Py_XDECREF(%(sm)s);
%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(%(x)s));
if(!%(sm)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should pick it up.
%(fail)s;
}
}
if ((NULL == cnda_%(am)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(am)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]))
if ((NULL == %(am)s)
|| (CudaNdarray_HOST_DIMS(%(am)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(cnda_%(am)s);
cnda_%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s));
if(!cnda_%(am)s)
Py_XDECREF(%(am)s);
%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!%(am)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc am output");
// no need to decref nll amd sm, the cleanup code should pick it up.
......@@ -142,19 +142,19 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
}
}
{
int n_blocks = CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0];
int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
int n_threads = 1; //TODO: launch more threads per row and do parallel sum and max reductions.
int n_shared_bytes = 0; //n_threads * sizeof(float);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_HOST_STRIDES(cnda_%(b)s)[0],
CudaNdarray_DEV_DATA(cnda_%(y_idx)s), CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0],
CudaNdarray_DEV_DATA(cnda_%(nll)s), CudaNdarray_HOST_STRIDES(cnda_%(nll)s)[0],
CudaNdarray_DEV_DATA(cnda_%(sm)s), CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1],
CudaNdarray_DEV_DATA(cnda_%(am)s), CudaNdarray_HOST_STRIDES(cnda_%(am)s)[0]);
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0],
CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(%(am)s), CudaNdarray_HOST_STRIDES(%(am)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -192,58 +192,58 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub):
fail = sub['fail']
return """
if ((cnda_%(dnll)s->nd != 1)
|| (cnda_%(sm)s->nd != 2)
|| (cnda_%(y_idx)s->nd != 1))
if ((%(dnll)s->nd != 1)
|| (%(sm)s->nd != 2)
|| (%(y_idx)s->nd != 1))
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(dnll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0])
if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(sm)s)[0])
{
PyErr_Format(PyExc_ValueError, "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
CudaNdarray_HOST_DIMS(cnda_%(dnll)s)[0],CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0]);
CudaNdarray_HOST_DIMS(%(dnll)s)[0],CudaNdarray_HOST_DIMS(%(sm)s)[0]);
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(cnda_%(dnll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])
if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if ((NULL == cnda_%(dx)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(dx)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(dx)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1]))
if ((NULL == %(dx)s)
|| (CudaNdarray_HOST_DIMS(%(dx)s)[0] != CudaNdarray_HOST_DIMS(%(sm)s)[0])
|| (CudaNdarray_HOST_DIMS(%(dx)s)[1] != CudaNdarray_HOST_DIMS(%(sm)s)[1]))
{
Py_XDECREF(cnda_%(dx)s);
cnda_%(dx)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(dx)s)
|| CudaNdarray_alloc_contiguous(cnda_%(dx)s, 2, CudaNdarray_HOST_DIMS(cnda_%(sm)s)))
Py_XDECREF(%(dx)s);
%(dx)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == %(dx)s)
|| CudaNdarray_alloc_contiguous(%(dx)s, 2, CudaNdarray_HOST_DIMS(%(sm)s)))
{
Py_XDECREF(cnda_%(dx)s);
cnda_%(dx)s = NULL;
Py_XDECREF(%(dx)s);
%(dx)s = NULL;
%(fail)s;
}
}
{
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[1]
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1]
>>>(
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[1],
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
CudaNdarray_DEV_DATA(cnda_%(dnll)s),
CudaNdarray_HOST_STRIDES(cnda_%(dnll)s)[0],
CudaNdarray_DEV_DATA(%(dnll)s),
CudaNdarray_HOST_STRIDES(%(dnll)s)[0],
CudaNdarray_DEV_DATA(cnda_%(sm)s),
CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1],
CudaNdarray_DEV_DATA(%(sm)s),
CudaNdarray_HOST_STRIDES(%(sm)s)[0],
CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(cnda_%(y_idx)s),
CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(y_idx)s),
CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(cnda_%(dx)s) //guaranteed c-contiguous
CudaNdarray_DEV_DATA(%(dx)s) //guaranteed c-contiguous
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
......@@ -253,7 +253,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
%(fail)s;
}
}
assert(cnda_%(dx)s);
assert(%(dx)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
......
......@@ -132,10 +132,10 @@ class CudaNdarrayType(Type):
def c_declare(self, name, sub):
ndim = self.ndim
c_typename = self.dtype_specs()[1]
return """ CudaNdarray * cnda_%(name)s;""" %locals()
return """ CudaNdarray * %(name)s;""" %locals()
def c_init(self, name, sub):
return "cnda_%(name)s = NULL;" % locals()
return "%(name)s = NULL;" % locals()
def c_extract(self, name, sub):
sio = StringIO.StringIO()
......@@ -148,61 +148,61 @@ class CudaNdarrayType(Type):
if (CudaNdarray_Check(py_%(name)s))
{
//fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
cnda_%(name)s = (CudaNdarray*)py_%(name)s;
//std::cerr << "c_extract " << cnda_%(name)s << '\\n';
if (cnda_%(name)s->nd != %(nd)s)
%(name)s = (CudaNdarray*)py_%(name)s;
//std::cerr << "c_extract " << %(name)s << '\\n';
if (%(name)s->nd != %(nd)s)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", cnda_%(name)s->nd);
cnda_%(name)s = NULL;
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", %(name)s->nd);
%(name)s = NULL;
%(fail)s;
}
//std::cerr << "c_extract " << cnda_%(name)s << " nd check passed\\n";
//std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
""" %locals()
for i, b in enumerate(self.broadcastable):
if b:
print >> sio, """
if (CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s] != 1)
if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s], %(i)s);
cnda_%(name)s = NULL;
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(%(name)s)[%(i)s], %(i)s);
%(name)s = NULL;
%(fail)s;
}
//std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n";
//std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n";
//std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n";
if (CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s])
//std::cerr << "c_extract " << %(name)s << "dim check %(i)s passed\\n";
//std::cerr << "c_extract " << %(name)s << "checking bcast %(i)s <" << %(name)s->str<< ">\\n";
//std::cerr << "c_extract " << %(name)s->str[%(i)s] << "\\n";
if (CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s])
{
//std::cerr << "c_extract bad stride detected...\\n";
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s], %(i)s);
cnda_%(name)s = NULL;
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s], %(i)s);
%(name)s = NULL;
%(fail)s;
}
//std::cerr << "c_extract " << cnda_%(name)s << "bcast check %(i)s passed\\n";
//std::cerr << "c_extract " << %(name)s << "bcast check %(i)s passed\\n";
""" %locals()
print >> sio, """
assert(cnda_%(name)s);
assert(%(name)s);
Py_INCREF(py_%(name)s);
}
else
{
//fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
cnda_%(name)s = NULL;
%(name)s = NULL;
%(fail)s;
}
//std::cerr << "c_extract done " << cnda_%(name)s << '\\n';
//std::cerr << "c_extract done " << %(name)s << '\\n';
""" % locals()
#print sio.getvalue()
return sio.getvalue()
def c_cleanup(self, name, sub):
return """
//std::cerr << "cleanup " << py_%(name)s << " " << cnda_%(name)s << "\\n";
//std::cerr << "cleanup " << py_%(name)s << " " << %(name)s << "\\n";
//fprintf(stderr, "c_cleanup CNDA py_object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
if (cnda_%(name)s)
if (%(name)s)
{
//fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %%p %%i\\n", cnda_%(name)s, (cnda_%(name)s->ob_refcnt));
Py_XDECREF(cnda_%(name)s);
//fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %%p %%i\\n", %(name)s, (%(name)s->ob_refcnt));
Py_XDECREF(%(name)s);
}
//std::cerr << "cleanup done" << py_%(name)s << "\\n";
""" % locals()
......@@ -211,7 +211,7 @@ class CudaNdarrayType(Type):
"""Override `CLinkerOp.c_sync` """
return """
//std::cerr << "sync\\n";
if (NULL == cnda_%(name)s) {
if (NULL == %(name)s) {
// failure: sync None to storage
Py_XDECREF(py_%(name)s);
py_%(name)s = Py_None;
......@@ -219,10 +219,10 @@ class CudaNdarrayType(Type):
}
else
{
if (py_%(name)s != (PyObject*)cnda_%(name)s)
if (py_%(name)s != (PyObject*)%(name)s)
{
Py_XDECREF(py_%(name)s);
py_%(name)s = (PyObject*)cnda_%(name)s;
py_%(name)s = (PyObject*)%(name)s;
Py_INCREF(py_%(name)s);
}
assert(py_%(name)s->ob_refcnt);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论