提交 abbba6cf authored 作者: Frederic Bastien's avatar Frederic Bastien

removed the prefix cnda_ as this cause a bug with OutputGuard on the gpu.

上级 f6d0fcef
...@@ -250,34 +250,34 @@ class GpuDimShuffle(Op): ...@@ -250,34 +250,34 @@ class GpuDimShuffle(Op):
#check input #check input
print >> sio, """ print >> sio, """
if (cnda_%(input)s->nd != %(nd_in)s) if (%(input)s->nd != %(nd_in)s)
{ {
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", cnda_%(input)s->nd); PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(input)s->nd);
%(fail)s; %(fail)s;
} }
""" %locals() """ %locals()
#alloc an output #alloc an output
print >> sio, """ print >> sio, """
if (cnda_%(res)s && (cnda_%(res)s->nd == %(nd_out)s)) if (%(res)s && (%(res)s->nd == %(nd_out)s))
{ {
//re-use previously-allocated cnda //re-use previously-allocated cnda
} }
else else
{ {
if (cnda_%(res)s) if (%(res)s)
{ {
if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s)) if (CudaNdarray_set_nd(%(res)s, %(nd_out)s))
{ {
Py_DECREF(cnda_%(res)s); Py_DECREF(%(res)s);
cnda_%(res)s = NULL; %(res)s = NULL;
%(fail)s; %(fail)s;
} }
} }
else else
{ {
cnda_%(res)s = (CudaNdarray*) CudaNdarray_New(%(nd_out)s); %(res)s = (CudaNdarray*) CudaNdarray_New(%(nd_out)s);
if (NULL == cnda_%(res)s) if (NULL == %(res)s)
{ {
%(fail)s; %(fail)s;
} }
...@@ -286,11 +286,11 @@ class GpuDimShuffle(Op): ...@@ -286,11 +286,11 @@ class GpuDimShuffle(Op):
""" %locals() """ %locals()
print >> sio, """ print >> sio, """
if (CudaNdarray_set_device_data(cnda_%(res)s, CudaNdarray_DEV_DATA(cnda_%(input)s), cnda_%(input)s)) if (CudaNdarray_set_device_data(%(res)s, CudaNdarray_DEV_DATA(%(input)s), %(input)s))
{ {
// err message set // err message set
Py_DECREF(cnda_%(res)s); Py_DECREF(%(res)s);
cnda_%(res)s = NULL; %(res)s = NULL;
%(fail)s; %(fail)s;
} }
""" %locals() """ %locals()
...@@ -303,28 +303,28 @@ class GpuDimShuffle(Op): ...@@ -303,28 +303,28 @@ class GpuDimShuffle(Op):
# that the size in this dimension is 1 # that the size in this dimension is 1
assert node.outputs[0].type.broadcastable[i] assert node.outputs[0].type.broadcastable[i]
print >> sio, """ print >> sio, """
CudaNdarray_set_dim(cnda_%(res)s, %(i)s, 1); CudaNdarray_set_dim(%(res)s, %(i)s, 1);
CudaNdarray_set_stride(cnda_%(res)s, %(i)s, 0); CudaNdarray_set_stride(%(res)s, %(i)s, 0);
""" %locals() """ %locals()
else: else:
print >> sio, """ print >> sio, """
CudaNdarray_set_dim(cnda_%(res)s, %(i)s, CudaNdarray_HOST_DIMS(cnda_%(input)s)[%(o)s]); CudaNdarray_set_dim(%(res)s, %(i)s, CudaNdarray_HOST_DIMS(%(input)s)[%(o)s]);
CudaNdarray_set_stride(cnda_%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(cnda_%(input)s)[%(o)s]); CudaNdarray_set_stride(%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(%(input)s)[%(o)s]);
""" %locals() """ %locals()
for i, o in enumerate(self.new_order): for i, o in enumerate(self.new_order):
print >> sio, """ print >> sio, """
//std::cerr << "GpuDimShuffle " << cnda_%(res)s << " str[%(i)s] = " << cnda_%(res)s->str[%(i)s] << "\\n"; //std::cerr << "GpuDimShuffle " << %(res)s << " str[%(i)s] = " << %(res)s->str[%(i)s] << "\\n";
""" %locals() """ %locals()
# copy the host dims and stride -> device # copy the host dims and stride -> device
if 0: if 0:
print >> sio, """ print >> sio, """
if (CudaNdarray_copy_structure_to_device(cnda_%(res)s)) if (CudaNdarray_copy_structure_to_device(%(res)s))
{ {
//err msg set //err msg set
Py_DECREF(cnda_%(res)s); Py_DECREF(%(res)s);
cnda_%(res)s = NULL; %(res)s = NULL;
%(fail)s; %(fail)s;
} }
""" %locals() """ %locals()
...@@ -405,9 +405,9 @@ class GpuSum(Op): ...@@ -405,9 +405,9 @@ class GpuSum(Op):
#check input #check input
print >> sio, """ print >> sio, """
if (cnda_%(x)s->nd != %(nd_in)s) if (%(x)s->nd != %(nd_in)s)
{ {
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", cnda_%(x)s->nd); PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd);
%(fail)s; %(fail)s;
} }
""" %locals() """ %locals()
...@@ -418,15 +418,15 @@ class GpuSum(Op): ...@@ -418,15 +418,15 @@ class GpuSum(Op):
# check the basics of out output # check the basics of out output
print >> sio, """ print >> sio, """
if ( !cnda_%(z)s if ( !%(z)s
|| (cnda_%(z)s->nd != %(nd_out)s) || (%(z)s->nd != %(nd_out)s)
""" % locals() """ % locals()
#ensure that the output has the right non-reduced dimensions #ensure that the output has the right non-reduced dimensions
j = 0 j = 0
for i in xrange(nd_in): for i in xrange(nd_in):
if not self.reduce_mask[i]: if not self.reduce_mask[i]:
print >> sio, " || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[%(j)s] !=CudaNdarray_HOST_DIMS(cnda_%(x)s)[%(i)s]) " % locals() print >> sio, " || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !=CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " % locals()
j += 1 j += 1
print >> sio, """ print >> sio, """
...@@ -438,13 +438,13 @@ class GpuSum(Op): ...@@ -438,13 +438,13 @@ class GpuSum(Op):
j = 0 j = 0
for i in xrange(nd_in): for i in xrange(nd_in):
if not self.reduce_mask[i]: if not self.reduce_mask[i]:
print >> sio, 'new_dims[%(j)s] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[%(i)s];' % locals() print >> sio, 'new_dims[%(j)s] = CudaNdarray_HOST_DIMS(%(x)s)[%(i)s];' % locals()
j += 1 j += 1
print >> sio, """ print >> sio, """
Py_XDECREF(cnda_%(z)s); Py_XDECREF(%(z)s);
cnda_%(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims); %(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims);
if (NULL == cnda_%(z)s) if (NULL == %(z)s)
{ {
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s; %(fail)s;
...@@ -469,13 +469,13 @@ class GpuSum(Op): ...@@ -469,13 +469,13 @@ class GpuSum(Op):
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(z)s), CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0] CudaNdarray_HOST_STRIDES(%(z)s)[0]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if (cudaSuccess != cudaGetLastError()) if (cudaSuccess != cudaGetLastError())
...@@ -495,21 +495,21 @@ class GpuSum(Op): ...@@ -495,21 +495,21 @@ class GpuSum(Op):
""" %locals() """ %locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
CudaNdarray_HOST_DIMS(cnda_%(x)s)[%(i)s], CudaNdarray_HOST_DIMS(%(x)s)[%(i)s],
""" %locals() """ %locals()
print >> sio, """ print >> sio, """
CudaNdarray_DEV_DATA(cnda_%(x)s) CudaNdarray_DEV_DATA(%(x)s)
""" %locals() """ %locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
,CudaNdarray_HOST_STRIDES(cnda_%(x)s)[%(i)s] ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]
""" %locals() """ %locals()
print >> sio, """ print >> sio, """
,CudaNdarray_DEV_DATA(cnda_%(z)s) ,CudaNdarray_DEV_DATA(%(z)s)
""" %locals() """ %locals()
for i in xrange(nd_out): for i in xrange(nd_out):
print >> sio, """ print >> sio, """
,CudaNdarray_HOST_STRIDES(cnda_%(z)s)[%(i)s] ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
""" %locals() """ %locals()
print >> sio, """ print >> sio, """
); );
...@@ -626,16 +626,16 @@ class GpuSum(Op): ...@@ -626,16 +626,16 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1); dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_1_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_1_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_1_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_DEV_DATA(cnda_%(z)s)); CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
...@@ -658,24 +658,24 @@ class GpuSum(Op): ...@@ -658,24 +658,24 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; while (n_threads.y * n_threads.x <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1; n_threads.y -= 1;
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0])
n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]; n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0];
dim3 n_blocks(1); dim3 n_blocks(1);
if (verbose) fprintf(stdout, "running kernel_reduce_sum_11_%(name)s\\n"); if (verbose) fprintf(stdout, "running kernel_reduce_sum_11_%(name)s\\n");
if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s); if (verbose) fprint_CudaNdarray(stdout, %(x)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_11_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_11_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(z)s)); CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
...@@ -699,19 +699,19 @@ class GpuSum(Op): ...@@ -699,19 +699,19 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(z)s), CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0] CudaNdarray_HOST_STRIDES(%(z)s)[0]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
...@@ -740,12 +740,12 @@ class GpuSum(Op): ...@@ -740,12 +740,12 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS) while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS)
{ {
if (n_blocks.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]) if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
break; break;
n_blocks.y += 1; n_blocks.y += 1;
} }
...@@ -760,17 +760,17 @@ class GpuSum(Op): ...@@ -760,17 +760,17 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{ {
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0])
break; break;
n_threads.y += 1; n_threads.y += 1;
} }
n_threads.y -= 1; n_threads.y -= 1;
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]);
%(makecall)s %(makecall)s
} }
""" % locals() """ % locals()
...@@ -781,14 +781,14 @@ class GpuSum(Op): ...@@ -781,14 +781,14 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[2], std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks( dim3 n_blocks(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_BLOCKS)); NUM_VECTOR_OP_BLOCKS));
while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS) while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS)
{ {
if (n_blocks.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]) if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
break; break;
n_blocks.y += 1; n_blocks.y += 1;
} }
...@@ -802,13 +802,13 @@ class GpuSum(Op): ...@@ -802,13 +802,13 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[2], std::min(CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{ {
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]) if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1])
break; break;
n_threads.y += 1; n_threads.y += 1;
} }
...@@ -817,7 +817,7 @@ class GpuSum(Op): ...@@ -817,7 +817,7 @@ class GpuSum(Op):
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK) while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
{ {
if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
break; break;
n_threads.z += 1; n_threads.z += 1;
} }
...@@ -833,39 +833,39 @@ class GpuSum(Op): ...@@ -833,39 +833,39 @@ class GpuSum(Op):
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3], std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1; n_threads.y -= 1;
if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]) if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]; n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z; while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
n_threads.z -= 1; n_threads.z -= 1;
if (n_threads.z > 64) if (n_threads.z > 64)
n_threads.z = 64; n_threads.z = 64;
if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
n_threads.z = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]; n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s); if (verbose) fprint_CudaNdarray(stdout, %(x)s);
if (verbose) fprint_CudaNdarray(stdout, cnda_%(z)s); if (verbose) fprint_CudaNdarray(stdout, %(z)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[3], CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2], CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3], CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(z)s), CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]); CudaNdarray_HOST_STRIDES(%(z)s)[0]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
......
...@@ -29,42 +29,42 @@ class GpuDot22(Op): ...@@ -29,42 +29,42 @@ class GpuDot22(Op):
z, = outputs z, = outputs
fail = sub['fail'] fail = sub['fail']
return """ return """
if (cnda_%(x)s->nd != 2) if (%(x)s->nd != 2)
{ {
PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", cnda_%(x)s->nd); PyErr_Format(PyExc_TypeError, "rank(x)==%%i must be 2", %(x)s->nd);
%(fail)s; %(fail)s;
} }
if (cnda_%(y)s->nd != 2) if (%(y)s->nd != 2)
{ {
PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", cnda_%(y)s->nd); PyErr_Format(PyExc_TypeError, "rank(y)==%%i must be 2", %(y)s->nd);
%(fail)s; %(fail)s;
} }
if ((NULL == cnda_%(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1])) || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(y)s)[1]))
{ {
//if (cnda_%(z)s) Py_DECREF(cnda_%(z)s); //if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(cnda_%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]; dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]; dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1];
cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null(); %(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(z)s) || CudaNdarray_alloc_contiguous(cnda_%(z)s, 2, dims)) if ((NULL == %(z)s) || CudaNdarray_alloc_contiguous(%(z)s, 2, dims))
{ {
if (cnda_%(z)s) if (%(z)s)
{ {
Py_DECREF(cnda_%(z)s); Py_DECREF(%(z)s);
cnda_%(z)s = NULL; %(z)s = NULL;
} }
%(fail)s; %(fail)s;
} }
} }
if (CudaNdarray_gemm(1.0f, cnda_%(x)s, cnda_%(y)s, 0.0f, cnda_%(z)s)) if (CudaNdarray_gemm(1.0f, %(x)s, %(y)s, 0.0f, %(z)s))
{ {
if (cnda_%(z)s) if (%(z)s)
{ {
Py_DECREF(cnda_%(z)s); Py_DECREF(%(z)s);
cnda_%(z)s = NULL; %(z)s = NULL;
} }
%(fail)s; %(fail)s;
} }
...@@ -105,12 +105,12 @@ class GpuGemm(Op): ...@@ -105,12 +105,12 @@ class GpuGemm(Op):
: (REAL)(((double*)%(b)s->data)[0]); : (REAL)(((double*)%(b)s->data)[0]);
#undef REAL #undef REAL
if (CudaNdarray_gemm(%(name)s_a, cnda_%(x)s, cnda_%(y)s, %(name)s_b, cnda_%(z_in)s)) if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_in)s))
{ {
%(fail)s; %(fail)s;
} }
cnda_%(z_out)s = cnda_%(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(cnda_%(z_out)s); Py_INCREF(%(z_out)s);
""" % locals() """ % locals()
gpu_gemm = GpuGemm() gpu_gemm = GpuGemm()
...@@ -222,15 +222,15 @@ class GpuDownsampleFactorMax(Op): ...@@ -222,15 +222,15 @@ class GpuDownsampleFactorMax(Op):
ignore_border = int(self.ignore_border) ignore_border = int(self.ignore_border)
return """ return """
int dims[4], xdim2, xdim3; int dims[4], xdim2, xdim3;
if (cnda_%(x)s->nd != 4) if (%(x)s->nd != 4)
{ {
PyErr_SetString(PyExc_ValueError, "rank error"); PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s; %(fail)s;
} }
xdim2 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]; xdim2 = CudaNdarray_HOST_DIMS(%(x)s)[2];
xdim3 = CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]; xdim3 = CudaNdarray_HOST_DIMS(%(x)s)[3];
dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]; dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]; dims[1] = CudaNdarray_HOST_DIMS(%(x)s)[1];
dims[2] = xdim2 / %(ds0)s; dims[2] = xdim2 / %(ds0)s;
dims[3] = xdim3 / %(ds1)s; dims[3] = xdim3 / %(ds1)s;
if (! %(ignore_border)s) if (! %(ignore_border)s)
...@@ -243,19 +243,19 @@ class GpuDownsampleFactorMax(Op): ...@@ -243,19 +243,19 @@ class GpuDownsampleFactorMax(Op):
%(fail)s; %(fail)s;
} }
if ((NULL == cnda_%(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != dims[0]) || (CudaNdarray_HOST_DIMS(%(z)s)[0] != dims[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != dims[1]) || (CudaNdarray_HOST_DIMS(%(z)s)[1] != dims[1])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[2] != dims[2]) || (CudaNdarray_HOST_DIMS(%(z)s)[2] != dims[2])
|| (CudaNdarray_HOST_DIMS(cnda_%(z)s)[3] != dims[3])) || (CudaNdarray_HOST_DIMS(%(z)s)[3] != dims[3]))
{ {
Py_XDECREF(cnda_%(z)s); Py_XDECREF(%(z)s);
cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null(); %(z)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(z)s) if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(cnda_%(z)s, 4, dims)) || CudaNdarray_alloc_contiguous(%(z)s, 4, dims))
{ {
Py_XDECREF(cnda_%(z)s); Py_XDECREF(%(z)s);
cnda_%(z)s = NULL; %(z)s = NULL;
PyErr_SetString(PyExc_ValueError, "Was not able to allocate output!"); PyErr_SetString(PyExc_ValueError, "Was not able to allocate output!");
%(fail)s; %(fail)s;
} }
...@@ -268,12 +268,12 @@ class GpuDownsampleFactorMax(Op): ...@@ -268,12 +268,12 @@ class GpuDownsampleFactorMax(Op):
if ((grid.x*grid.y) && dims[3]) if ((grid.x*grid.y) && dims[3])
kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>( kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>(
dims[0], dims[1], dims[2], dims[3], xdim2, xdim3, dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2], CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3], CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(z)s)); CudaNdarray_DEV_DATA(%(z)s));
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -372,57 +372,57 @@ class GpuDownsampleFactorMaxGrad(Op): ...@@ -372,57 +372,57 @@ class GpuDownsampleFactorMaxGrad(Op):
ds0, ds1 = self.ds ds0, ds1 = self.ds
ignore_border = int(self.ignore_border) ignore_border = int(self.ignore_border)
return """ return """
if (cnda_%(x)s->nd != 4 if (%(x)s->nd != 4
|| cnda_%(z)s->nd != 4 || %(z)s->nd != 4
|| cnda_%(gz)s->nd != 4) || %(gz)s->nd != 4)
{ {
PyErr_SetString(PyExc_ValueError, "rank error"); PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s; %(fail)s;
} }
if ((NULL == cnda_%(gx)s) if ((NULL == %(gx)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) || (CudaNdarray_HOST_DIMS(%(gx)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]) || (CudaNdarray_HOST_DIMS(%(gx)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1])
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[2] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]) || (CudaNdarray_HOST_DIMS(%(gx)s)[2] != CudaNdarray_HOST_DIMS(%(x)s)[2])
|| (CudaNdarray_HOST_DIMS(cnda_%(gx)s)[3] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[3])) || (CudaNdarray_HOST_DIMS(%(gx)s)[3] != CudaNdarray_HOST_DIMS(%(x)s)[3]))
{ {
Py_XDECREF(cnda_%(gx)s); Py_XDECREF(%(gx)s);
cnda_%(gx)s = (CudaNdarray*)CudaNdarray_new_null(); %(gx)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(gx)s) if ((NULL == %(gx)s)
|| CudaNdarray_alloc_contiguous(cnda_%(gx)s, 4, CudaNdarray_HOST_DIMS(cnda_%(x)s))) || CudaNdarray_alloc_contiguous(%(gx)s, 4, CudaNdarray_HOST_DIMS(%(x)s)))
{ {
Py_XDECREF(cnda_%(gx)s); Py_XDECREF(%(gx)s);
cnda_%(gx)s = NULL; %(gx)s = NULL;
%(fail)s; %(fail)s;
} }
} }
{ {
//TODO: implement this by supporting more //TODO: implement this by supporting more
//outputs than threads //outputs than threads
dim3 grid(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(cnda_%(x)s)[2]); dim3 grid(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]);
dim3 block(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3]); dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]);
kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>( kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
CudaNdarray_HOST_DIMS(cnda_%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(z)s)[1], CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(cnda_%(z)s)[2], CudaNdarray_HOST_DIMS(%(z)s)[2],
CudaNdarray_HOST_DIMS(cnda_%(z)s)[3], CudaNdarray_HOST_DIMS(%(z)s)[3],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[3], CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2], CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3], CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(cnda_%(z)s), CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0], CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[1], CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[2], CudaNdarray_HOST_STRIDES(%(z)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(z)s)[3], CudaNdarray_HOST_STRIDES(%(z)s)[3],
CudaNdarray_DEV_DATA(cnda_%(gz)s), CudaNdarray_DEV_DATA(%(gz)s),
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[0], CudaNdarray_HOST_STRIDES(%(gz)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[1], CudaNdarray_HOST_STRIDES(%(gz)s)[1],
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[2], CudaNdarray_HOST_STRIDES(%(gz)s)[2],
CudaNdarray_HOST_STRIDES(cnda_%(gz)s)[3], CudaNdarray_HOST_STRIDES(%(gz)s)[3],
CudaNdarray_DEV_DATA(cnda_%(gx)s)); CudaNdarray_DEV_DATA(%(gx)s));
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
......
...@@ -885,20 +885,20 @@ class NaiveAlgo(object): ...@@ -885,20 +885,20 @@ class NaiveAlgo(object):
for iname in inputs: for iname in inputs:
print >> sio, """ print >> sio, """
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != cnda_%(iname)s->nd) if (%(nd)s != %(iname)s->nd)
{ {
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd); PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd);
%(fail)s; %(fail)s;
} }
for (int i = 0; i< %(nd)s; ++i) for (int i = 0; i< %(nd)s; ++i)
{ {
dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] : dims[i]; dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
if ((CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i])) if ((CudaNdarray_HOST_DIMS(%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
{ {
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i", PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
i, i,
CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i], CudaNdarray_HOST_DIMS(%(iname)s)[i],
dims[i] dims[i]
); );
%(fail)s; %(fail)s;
...@@ -909,31 +909,31 @@ class NaiveAlgo(object): ...@@ -909,31 +909,31 @@ class NaiveAlgo(object):
#check that all outputs have valid dimensions #check that all outputs have valid dimensions
for oname in outputs: for oname in outputs:
print >> sio, """ print >> sio, """
for (int i = 0; (i< %(nd)s) && (cnda_%(oname)s); ++i) { for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(cnda_%(oname)s)[i]) if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{ {
Py_DECREF(cnda_%(oname)s); Py_DECREF(%(oname)s);
cnda_%(oname)s = NULL; %(oname)s = NULL;
} }
} }
if (NULL == cnda_%(oname)s) if (NULL == %(oname)s)
{ {
cnda_%(oname)s = (CudaNdarray*)CudaNdarray_new_null(); %(oname)s = (CudaNdarray*)CudaNdarray_new_null();
if (!cnda_%(oname)s) if (!%(oname)s)
{ {
//error string already set //error string already set
%(fail)s; %(fail)s;
} }
if (CudaNdarray_alloc_contiguous(cnda_%(oname)s, %(nd)s, dims)) if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
{ {
//error string already set //error string already set
Py_DECREF(cnda_%(oname)s); Py_DECREF(%(oname)s);
cnda_%(oname)s = NULL; %(oname)s = NULL;
%(fail)s; %(fail)s;
} }
} }
//std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals() """ % locals()
print >> sio, """ print >> sio, """
{ {
...@@ -943,11 +943,11 @@ class NaiveAlgo(object): ...@@ -943,11 +943,11 @@ class NaiveAlgo(object):
""" % locals() """ % locals()
for iname in inputs: for iname in inputs:
print >> sio, """ print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_HOST_STRIDES(cnda_%(iname)s) , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
""" % locals() """ % locals()
for oname in outputs: for oname in outputs:
print >> sio, """ print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_HOST_STRIDES(cnda_%(oname)s) , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
""" % locals() """ % locals()
print >> sio, """ print >> sio, """
)) ))
...@@ -956,8 +956,8 @@ class NaiveAlgo(object): ...@@ -956,8 +956,8 @@ class NaiveAlgo(object):
""" """
for oname in outputs: for oname in outputs:
print >> sio, """ print >> sio, """
Py_DECREF(cnda_%(oname)s); Py_DECREF(%(oname)s);
cnda_%(oname)s = NULL; %(oname)s = NULL;
""" % locals() """ % locals()
print >> sio, """ print >> sio, """
%(fail)s; %(fail)s;
......
...@@ -81,60 +81,60 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): ...@@ -81,60 +81,60 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
if (cnda_%(y_idx)s->nd != 1) if (%(y_idx)s->nd != 1)
{ {
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor"); PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s; %(fail)s;
} }
if (cnda_%(x)s->nd != 2) if (%(x)s->nd != 2)
{ {
PyErr_SetString(PyExc_ValueError, "x not 2d tensor"); PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s; %(fail)s;
} }
if (cnda_%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_SetString(PyExc_ValueError, "b not 1d tensor"); PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]) if (CudaNdarray_HOST_DIMS(%(x)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{ {
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments"); PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(cnda_%(x)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(b)s)[0]) if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
{ {
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments"); PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments");
%(fail)s; %(fail)s;
} }
if ((NULL == cnda_%(nll)s) //initial condition if ((NULL == %(nll)s) //initial condition
|| (CudaNdarray_HOST_DIMS(cnda_%(nll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])) || (CudaNdarray_HOST_DIMS(%(nll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
{ {
Py_XDECREF(cnda_%(nll)s); Py_XDECREF(%(nll)s);
cnda_%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)); %(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!cnda_%(nll)s) if(!%(nll)s)
{ {
%(fail)s; %(fail)s;
} }
} }
if ((NULL == cnda_%(sm)s) if ((NULL == %(sm)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0]) || (CudaNdarray_HOST_DIMS(%(sm)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[1])) || (CudaNdarray_HOST_DIMS(%(sm)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
{ {
Py_XDECREF(cnda_%(sm)s); Py_XDECREF(%(sm)s);
cnda_%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(cnda_%(x)s)); %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(%(x)s));
if(!cnda_%(sm)s) if(!%(sm)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should pick it up. // no need to decref cnda_nll, the cleanup code should pick it up.
%(fail)s; %(fail)s;
} }
} }
if ((NULL == cnda_%(am)s) if ((NULL == %(am)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(am)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0])) || (CudaNdarray_HOST_DIMS(%(am)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
{ {
Py_XDECREF(cnda_%(am)s); Py_XDECREF(%(am)s);
cnda_%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)); %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!cnda_%(am)s) if(!%(am)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc am output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc am output");
// no need to decref nll amd sm, the cleanup code should pick it up. // no need to decref nll amd sm, the cleanup code should pick it up.
...@@ -142,19 +142,19 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): ...@@ -142,19 +142,19 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
} }
} }
{ {
int n_blocks = CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0]; int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
int n_threads = 1; //TODO: launch more threads per row and do parallel sum and max reductions. int n_threads = 1; //TODO: launch more threads per row and do parallel sum and max reductions.
int n_shared_bytes = 0; //n_threads * sizeof(float); int n_shared_bytes = 0; //n_threads * sizeof(float);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>( k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(cnda_%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(x)s), CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_HOST_STRIDES(cnda_%(b)s)[0], CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(cnda_%(y_idx)s), CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0], CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(cnda_%(nll)s), CudaNdarray_HOST_STRIDES(cnda_%(nll)s)[0], CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0],
CudaNdarray_DEV_DATA(cnda_%(sm)s), CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0], CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1], CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(cnda_%(am)s), CudaNdarray_HOST_STRIDES(cnda_%(am)s)[0]); CudaNdarray_DEV_DATA(%(am)s), CudaNdarray_HOST_STRIDES(%(am)s)[0]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) if (cudaSuccess != err)
...@@ -192,58 +192,58 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -192,58 +192,58 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub): def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub):
fail = sub['fail'] fail = sub['fail']
return """ return """
if ((cnda_%(dnll)s->nd != 1) if ((%(dnll)s->nd != 1)
|| (cnda_%(sm)s->nd != 2) || (%(sm)s->nd != 2)
|| (cnda_%(y_idx)s->nd != 1)) || (%(y_idx)s->nd != 1))
{ {
PyErr_SetString(PyExc_ValueError, "rank error"); PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(cnda_%(dnll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0]) if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(sm)s)[0])
{ {
PyErr_Format(PyExc_ValueError, "dnll.shape[0] == %%i, but sm.shape[0] == %%i", PyErr_Format(PyExc_ValueError, "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
CudaNdarray_HOST_DIMS(cnda_%(dnll)s)[0],CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0]); CudaNdarray_HOST_DIMS(%(dnll)s)[0],CudaNdarray_HOST_DIMS(%(sm)s)[0]);
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(cnda_%(dnll)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(y_idx)s)[0]) if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{ {
PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]"); PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]");
%(fail)s; %(fail)s;
} }
if ((NULL == cnda_%(dx)s) if ((NULL == %(dx)s)
|| (CudaNdarray_HOST_DIMS(cnda_%(dx)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(sm)s)[0]) || (CudaNdarray_HOST_DIMS(%(dx)s)[0] != CudaNdarray_HOST_DIMS(%(sm)s)[0])
|| (CudaNdarray_HOST_DIMS(cnda_%(dx)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(sm)s)[1])) || (CudaNdarray_HOST_DIMS(%(dx)s)[1] != CudaNdarray_HOST_DIMS(%(sm)s)[1]))
{ {
Py_XDECREF(cnda_%(dx)s); Py_XDECREF(%(dx)s);
cnda_%(dx)s = (CudaNdarray*)CudaNdarray_new_null(); %(dx)s = (CudaNdarray*)CudaNdarray_new_null();
if ((NULL == cnda_%(dx)s) if ((NULL == %(dx)s)
|| CudaNdarray_alloc_contiguous(cnda_%(dx)s, 2, CudaNdarray_HOST_DIMS(cnda_%(sm)s))) || CudaNdarray_alloc_contiguous(%(dx)s, 2, CudaNdarray_HOST_DIMS(%(sm)s)))
{ {
Py_XDECREF(cnda_%(dx)s); Py_XDECREF(%(dx)s);
cnda_%(dx)s = NULL; %(dx)s = NULL;
%(fail)s; %(fail)s;
} }
} }
{ {
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<< <<<
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[0], CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[1] CudaNdarray_HOST_DIMS(%(dx)s)[1]
>>>( >>>(
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[0], CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(cnda_%(dx)s)[1], CudaNdarray_HOST_DIMS(%(dx)s)[1],
CudaNdarray_DEV_DATA(cnda_%(dnll)s), CudaNdarray_DEV_DATA(%(dnll)s),
CudaNdarray_HOST_STRIDES(cnda_%(dnll)s)[0], CudaNdarray_HOST_STRIDES(%(dnll)s)[0],
CudaNdarray_DEV_DATA(cnda_%(sm)s), CudaNdarray_DEV_DATA(%(sm)s),
CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[0],
CudaNdarray_HOST_STRIDES(cnda_%(sm)s)[1], CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(cnda_%(y_idx)s), CudaNdarray_DEV_DATA(%(y_idx)s),
CudaNdarray_HOST_STRIDES(cnda_%(y_idx)s)[0], CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(cnda_%(dx)s) //guaranteed c-contiguous CudaNdarray_DEV_DATA(%(dx)s) //guaranteed c-contiguous
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
...@@ -253,7 +253,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -253,7 +253,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
%(fail)s; %(fail)s;
} }
} }
assert(cnda_%(dx)s); assert(%(dx)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
......
...@@ -132,10 +132,10 @@ class CudaNdarrayType(Type): ...@@ -132,10 +132,10 @@ class CudaNdarrayType(Type):
def c_declare(self, name, sub): def c_declare(self, name, sub):
ndim = self.ndim ndim = self.ndim
c_typename = self.dtype_specs()[1] c_typename = self.dtype_specs()[1]
return """ CudaNdarray * cnda_%(name)s;""" %locals() return """ CudaNdarray * %(name)s;""" %locals()
def c_init(self, name, sub): def c_init(self, name, sub):
return "cnda_%(name)s = NULL;" % locals() return "%(name)s = NULL;" % locals()
def c_extract(self, name, sub): def c_extract(self, name, sub):
sio = StringIO.StringIO() sio = StringIO.StringIO()
...@@ -148,61 +148,61 @@ class CudaNdarrayType(Type): ...@@ -148,61 +148,61 @@ class CudaNdarrayType(Type):
if (CudaNdarray_Check(py_%(name)s)) if (CudaNdarray_Check(py_%(name)s))
{ {
//fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); //fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
cnda_%(name)s = (CudaNdarray*)py_%(name)s; %(name)s = (CudaNdarray*)py_%(name)s;
//std::cerr << "c_extract " << cnda_%(name)s << '\\n'; //std::cerr << "c_extract " << %(name)s << '\\n';
if (cnda_%(name)s->nd != %(nd)s) if (%(name)s->nd != %(nd)s)
{ {
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", cnda_%(name)s->nd); PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", %(name)s->nd);
cnda_%(name)s = NULL; %(name)s = NULL;
%(fail)s; %(fail)s;
} }
//std::cerr << "c_extract " << cnda_%(name)s << " nd check passed\\n"; //std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
""" %locals() """ %locals()
for i, b in enumerate(self.broadcastable): for i, b in enumerate(self.broadcastable):
if b: if b:
print >> sio, """ print >> sio, """
if (CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s] != 1) if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
{ {
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s], %(i)s); PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(%(name)s)[%(i)s], %(i)s);
cnda_%(name)s = NULL; %(name)s = NULL;
%(fail)s; %(fail)s;
} }
//std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n"; //std::cerr << "c_extract " << %(name)s << "dim check %(i)s passed\\n";
//std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n"; //std::cerr << "c_extract " << %(name)s << "checking bcast %(i)s <" << %(name)s->str<< ">\\n";
//std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n"; //std::cerr << "c_extract " << %(name)s->str[%(i)s] << "\\n";
if (CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s]) if (CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s])
{ {
//std::cerr << "c_extract bad stride detected...\\n"; //std::cerr << "c_extract bad stride detected...\\n";
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s], %(i)s); PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s], %(i)s);
cnda_%(name)s = NULL; %(name)s = NULL;
%(fail)s; %(fail)s;
} }
//std::cerr << "c_extract " << cnda_%(name)s << "bcast check %(i)s passed\\n"; //std::cerr << "c_extract " << %(name)s << "bcast check %(i)s passed\\n";
""" %locals() """ %locals()
print >> sio, """ print >> sio, """
assert(cnda_%(name)s); assert(%(name)s);
Py_INCREF(py_%(name)s); Py_INCREF(py_%(name)s);
} }
else else
{ {
//fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray"); PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
cnda_%(name)s = NULL; %(name)s = NULL;
%(fail)s; %(fail)s;
} }
//std::cerr << "c_extract done " << cnda_%(name)s << '\\n'; //std::cerr << "c_extract done " << %(name)s << '\\n';
""" % locals() """ % locals()
#print sio.getvalue() #print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def c_cleanup(self, name, sub): def c_cleanup(self, name, sub):
return """ return """
//std::cerr << "cleanup " << py_%(name)s << " " << cnda_%(name)s << "\\n"; //std::cerr << "cleanup " << py_%(name)s << " " << %(name)s << "\\n";
//fprintf(stderr, "c_cleanup CNDA py_object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); //fprintf(stderr, "c_cleanup CNDA py_object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
if (cnda_%(name)s) if (%(name)s)
{ {
//fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %%p %%i\\n", cnda_%(name)s, (cnda_%(name)s->ob_refcnt)); //fprintf(stderr, "c_cleanup CNDA cn_object w refcnt %%p %%i\\n", %(name)s, (%(name)s->ob_refcnt));
Py_XDECREF(cnda_%(name)s); Py_XDECREF(%(name)s);
} }
//std::cerr << "cleanup done" << py_%(name)s << "\\n"; //std::cerr << "cleanup done" << py_%(name)s << "\\n";
""" % locals() """ % locals()
...@@ -211,7 +211,7 @@ class CudaNdarrayType(Type): ...@@ -211,7 +211,7 @@ class CudaNdarrayType(Type):
"""Override `CLinkerOp.c_sync` """ """Override `CLinkerOp.c_sync` """
return """ return """
//std::cerr << "sync\\n"; //std::cerr << "sync\\n";
if (NULL == cnda_%(name)s) { if (NULL == %(name)s) {
// failure: sync None to storage // failure: sync None to storage
Py_XDECREF(py_%(name)s); Py_XDECREF(py_%(name)s);
py_%(name)s = Py_None; py_%(name)s = Py_None;
...@@ -219,10 +219,10 @@ class CudaNdarrayType(Type): ...@@ -219,10 +219,10 @@ class CudaNdarrayType(Type):
} }
else else
{ {
if (py_%(name)s != (PyObject*)cnda_%(name)s) if (py_%(name)s != (PyObject*)%(name)s)
{ {
Py_XDECREF(py_%(name)s); Py_XDECREF(py_%(name)s);
py_%(name)s = (PyObject*)cnda_%(name)s; py_%(name)s = (PyObject*)%(name)s;
Py_INCREF(py_%(name)s); Py_INCREF(py_%(name)s);
} }
assert(py_%(name)s->ob_refcnt); assert(py_%(name)s->ob_refcnt);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论