提交 032c5b14 authored 作者: David Warde-Farley's avatar David Warde-Farley

Whitespace fixes.

上级 9d85fda7
...@@ -150,15 +150,15 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): ...@@ -150,15 +150,15 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>( k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0], CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0], CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0], CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0],
CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1], CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(%(am)s), CudaNdarray_HOST_STRIDES(%(am)s)[0]); CudaNdarray_DEV_DATA(%(am)s), CudaNdarray_HOST_STRIDES(%(am)s)[0]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) if (cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err));
// no need to decref output vars the cleanup code should pick them up. // no need to decref output vars the cleanup code should pick them up.
...@@ -233,7 +233,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -233,7 +233,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256) std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>( >>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0], CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1], CudaNdarray_HOST_DIMS(%(dx)s)[1],
CudaNdarray_DEV_DATA(%(dnll)s), CudaNdarray_DEV_DATA(%(dnll)s),
CudaNdarray_HOST_STRIDES(%(dnll)s)[0], CudaNdarray_HOST_STRIDES(%(dnll)s)[0],
...@@ -249,11 +249,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -249,11 +249,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s", cudaGetErrorString(err));
%(fail)s; %(fail)s;
} }
} }
assert(%(dx)s); assert(%(dx)s);
""" % locals() """ % locals()
...@@ -337,7 +337,7 @@ class GpuSoftmax (Op): ...@@ -337,7 +337,7 @@ class GpuSoftmax (Op):
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float) CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>( >>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -347,18 +347,18 @@ class GpuSoftmax (Op): ...@@ -347,18 +347,18 @@ class GpuSoftmax (Op):
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s; %(fail)s;
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s"%nodename, return nvcc_kernel("kSoftmax_%s"%nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm'], 'float * sm'],
body=[ body=[
...@@ -436,7 +436,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -436,7 +436,7 @@ class GpuSoftmaxWithBias (Op):
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float) CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>( >>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -449,18 +449,18 @@ class GpuSoftmaxWithBias (Op): ...@@ -449,18 +449,18 @@ class GpuSoftmaxWithBias (Op):
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s; %(fail)s;
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmaxWithBias_%s"%nodename, return nvcc_kernel("kSoftmaxWithBias_%s"%nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0', 'const float * b', 'const int sb0',
'float * sm'], 'float * sm'],
......
...@@ -18,7 +18,7 @@ grad_todo = GradTodo() ...@@ -18,7 +18,7 @@ grad_todo = GradTodo()
class FFT(Op): class FFT(Op):
"""Fast Fourier Transform """Fast Fourier Transform
.. TODO: .. TODO:
The current implementation just works for matrix inputs, and permits taking a 1D FFT over The current implementation just works for matrix inputs, and permits taking a 1D FFT over
either rows or columns. Add support for N-D FFTs as provided by either numpy or FFTW either rows or columns. Add support for N-D FFTs as provided by either numpy or FFTW
...@@ -29,7 +29,7 @@ class FFT(Op): ...@@ -29,7 +29,7 @@ class FFT(Op):
.. TODO: .. TODO:
unit tests. unit tests.
""" """
default_output = 0 default_output = 0
...@@ -61,7 +61,7 @@ class FFT(Op): ...@@ -61,7 +61,7 @@ class FFT(Op):
raise TypeError('Argument to HalfFFT must not be complex', frames) raise TypeError('Argument to HalfFFT must not be complex', frames)
spectrogram = tensor.zmatrix() spectrogram = tensor.zmatrix()
buf = generic() buf = generic()
# The `buf` output is present for future work # The `buf` output is present for future work
# when we call FFTW directly and re-use the 'plan' that FFTW creates. # when we call FFTW directly and re-use the 'plan' that FFTW creates.
# In that case, buf would store a CObject encapsulating the plan. # In that case, buf would store a CObject encapsulating the plan.
rval = Apply(self, [_frames, _n, _axis], [spectrogram, buf]) rval = Apply(self, [_frames, _n, _axis], [spectrogram, buf])
......
...@@ -25,13 +25,13 @@ class Minimal(gof.Op): ...@@ -25,13 +25,13 @@ class Minimal(gof.Op):
return hash(type(self)) return hash(type(self))
def make_node(self, *args): def make_node(self, *args):
# HERE `args` must be THEANO VARIABLES # HERE `args` must be THEANO VARIABLES
return gof.Apply(op=self, inputs=args, outputs=[tensor.lscalar()]) return gof.Apply(op=self, inputs=args, outputs=[tensor.lscalar()])
def perform(self, node, inputs, (output, )): def perform(self, node, inputs, (output, )):
# HERE `inputs` are PYTHON OBJECTS # HERE `inputs` are PYTHON OBJECTS
# do what you want here, # do what you want here,
# but do not modify any of the arguments [inplace]. # but do not modify any of the arguments [inplace].
print "perform got %i arguments" % len(inputs) print "perform got %i arguments" % len(inputs)
......
...@@ -59,12 +59,12 @@ class Multinomial(Op): ...@@ -59,12 +59,12 @@ class Multinomial(Op):
npy_intp dims[2]; npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0]; dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1]; dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2, %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims, dims,
type_num_%(pvals)s, type_num_%(pvals)s,
0); 0);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
...@@ -96,7 +96,7 @@ class Multinomial(Op): ...@@ -96,7 +96,7 @@ class Multinomial(Op):
} }
} }
} }
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
multinomial = Multinomial() multinomial = Multinomial()
...@@ -128,24 +128,24 @@ class GpuMultinomial(Multinomial): ...@@ -128,24 +128,24 @@ class GpuMultinomial(Multinomial):
float * global_unis, float * global_unis,
float * global_outs float * global_outs
) )
{ {
int n = blockDim.x*blockIdx.x + threadIdx.x; int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi) if (n < nb_multi)
{ {
float cummul = 0.; float cummul = 0.;
bool done = false; bool done = false;
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides]; cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.; float current_out = 0.;
if (!done && global_unis[n] < cummul) if (!done && global_unis[n] < cummul)
{ {
current_out = 1.; current_out = 1.;
done = true; done = true;
} }
global_outs[n + m * nb_multi] = current_out; global_outs[n + m * nb_multi] = current_out;
} }
} }
...@@ -157,7 +157,7 @@ class GpuMultinomial(Multinomial): ...@@ -157,7 +157,7 @@ class GpuMultinomial(Multinomial):
def c_code(self, node, name, (pvals, unis), (z,), sub): def c_code(self, node, name, (pvals, unis), (z,), sub):
fail = sub['fail'] fail = sub['fail']
return """ return """
if (%(pvals)s->nd != 2) if (%(pvals)s->nd != 2)
{ {
PyErr_Format(PyExc_TypeError, "pvals wrong rank"); PyErr_Format(PyExc_TypeError, "pvals wrong rank");
...@@ -168,7 +168,7 @@ class GpuMultinomial(Multinomial): ...@@ -168,7 +168,7 @@ class GpuMultinomial(Multinomial):
PyErr_Format(PyExc_TypeError, "unis wrong rank"); PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]) if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{ {
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
...@@ -201,7 +201,7 @@ class GpuMultinomial(Multinomial): ...@@ -201,7 +201,7 @@ class GpuMultinomial(Multinomial):
{ // NESTED SCOPE { // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0]; int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1]; int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
//TODO : change this for a beautiful constant //TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1; int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1; int nb_blocks = max_nb_blocks + 1;
...@@ -212,7 +212,7 @@ class GpuMultinomial(Multinomial): ...@@ -212,7 +212,7 @@ class GpuMultinomial(Multinomial):
if (nb_multi %% nb_threads == 0) if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads; nb_blocks = nb_multi/nb_threads;
else else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.); nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks); } while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads); //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
...@@ -224,7 +224,7 @@ class GpuMultinomial(Multinomial): ...@@ -224,7 +224,7 @@ class GpuMultinomial(Multinomial):
%(fail)s; %(fail)s;
} }
dim3 n_blocks(nb_blocks,1,1); dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1); dim3 n_threads(nb_threads,1,1);
int n_shared = 0; int n_shared = 0;
...@@ -240,7 +240,7 @@ class GpuMultinomial(Multinomial): ...@@ -240,7 +240,7 @@ class GpuMultinomial(Multinomial):
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n", PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s", "k_multi_warp_%(name)s",
...@@ -264,4 +264,4 @@ def use_gpu_multinomial(node): ...@@ -264,4 +264,4 @@ def use_gpu_multinomial(node):
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))] return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
if cuda_enabled:#theano.config.device.startswith('gpu'): if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial) register_specialize(use_gpu_multinomial)
...@@ -7,7 +7,7 @@ import numpy ...@@ -7,7 +7,7 @@ import numpy
import __builtin__ import __builtin__
class NeighbourhoodsFromImages(Op): class NeighbourhoodsFromImages(Op):
def __init__(self, n_dims_before, dims_neighbourhoods, def __init__(self, n_dims_before, dims_neighbourhoods,
strides=None, ignore_border=False, inverse=False): strides=None, ignore_border=False, inverse=False):
""" """
This extracts neighbourhoods from "images", but in a This extracts neighbourhoods from "images", but in a
...@@ -65,10 +65,10 @@ class NeighbourhoodsFromImages(Op): ...@@ -65,10 +65,10 @@ class NeighbourhoodsFromImages(Op):
""" """
self.n_dims_before = n_dims_before self.n_dims_before = n_dims_before
self.dims_neighbourhoods = dims_neighbourhoods self.dims_neighbourhoods = dims_neighbourhoods
if not strides is None: if not strides is None:
self.strides = strides self.strides = strides
else: else:
self.strides = dims_neighbourhoods self.strides = dims_neighbourhoods
self.ignore_border = ignore_border self.ignore_border = ignore_border
self.inverse = inverse self.inverse = inverse
...@@ -99,7 +99,7 @@ class NeighbourhoodsFromImages(Op): ...@@ -99,7 +99,7 @@ class NeighbourhoodsFromImages(Op):
def __str__(self): def __str__(self):
return '%s{%s,%s,%s,%s}' % \ return '%s{%s,%s,%s,%s}' % \
(self.__class__.__name__, (self.__class__.__name__,
self.n_dims_before, self.n_dims_before,
self.dims_neighbourhoods, self.dims_neighbourhoods,
self.strides, self.strides,
...@@ -135,7 +135,7 @@ class NeighbourhoodsFromImages(Op): ...@@ -135,7 +135,7 @@ class NeighbourhoodsFromImages(Op):
# the number of strides performed by NeighFromImg is # the number of strides performed by NeighFromImg is
# directly given by this shape # directly given by this shape
num_strides.append(output_shape[self.n_dims_before + i]) num_strides.append(output_shape[self.n_dims_before + i])
# our Op's output image must be at least this wide # our Op's output image must be at least this wide
at_least_width = num_strides[i] * self.strides[i] at_least_width = num_strides[i] * self.strides[i]
...@@ -231,7 +231,7 @@ class NeighbourhoodsFromImages(Op): ...@@ -231,7 +231,7 @@ class NeighbourhoodsFromImages(Op):
("for neigh_idx_%d in xrange(min(max_neigh_idx_%d,"\ ("for neigh_idx_%d in xrange(min(max_neigh_idx_%d,"\
+" self.dims_neighbourhoods[%d])):\n") % \ +" self.dims_neighbourhoods[%d])):\n") % \
(inner_dim_no, inner_dim_no, inner_dim_no) (inner_dim_no, inner_dim_no, inner_dim_no)
return code_before return code_before
def _py_flattened_idx(self): def _py_flattened_idx(self):
...@@ -268,8 +268,8 @@ class NeighbourhoodsFromImages(Op): ...@@ -268,8 +268,8 @@ class NeighbourhoodsFromImages(Op):
class ImagesFromNeighbourhoods(NeighbourhoodsFromImages): class ImagesFromNeighbourhoods(NeighbourhoodsFromImages):
def __init__(self, n_dims_before, dims_neighbourhoods, def __init__(self, n_dims_before, dims_neighbourhoods,
strides=None, ignore_border=False): strides=None, ignore_border=False):
NeighbourhoodsFromImages.__init__(self,n_dims_before, dims_neighbourhoods, NeighbourhoodsFromImages.__init__(self,n_dims_before, dims_neighbourhoods,
strides=strides, ignore_border=ignore_border, strides=strides, ignore_border=ignore_border,
inverse=True) inverse=True)
# and that's all there is to it # and that's all there is to it
...@@ -88,7 +88,7 @@ class Images2Neibs(Op): ...@@ -88,7 +88,7 @@ class Images2Neibs(Op):
PyErr_Format(PyExc_TypeError, "neib_step wrong step ; has to contain 2 elements"); PyErr_Format(PyExc_TypeError, "neib_step wrong step ; has to contain 2 elements");
%(fail)s; %(fail)s;
} }
// (c,d) = neib_shape // (c,d) = neib_shape
const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0); const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1); const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
...@@ -137,7 +137,7 @@ class Images2Neibs(Op): ...@@ -137,7 +137,7 @@ class Images2Neibs(Op):
* grid_d * grid_d
* (%(ten4)s->dimensions)[1] * (%(ten4)s->dimensions)[1]
* (%(ten4)s->dimensions)[0]; * (%(ten4)s->dimensions)[0];
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| ((%(z)s->dimensions)[0] != z_dim0 ) || ((%(z)s->dimensions)[0] != z_dim0 )
|| ((%(z)s->dimensions)[1] != z_dim1 ) || ((%(z)s->dimensions)[1] != z_dim1 )
...@@ -147,12 +147,12 @@ class Images2Neibs(Op): ...@@ -147,12 +147,12 @@ class Images2Neibs(Op):
npy_intp dims[2]; npy_intp dims[2];
dims[0] = z_dim0; dims[0] = z_dim0;
dims[1] = z_dim1; dims[1] = z_dim1;
%(z)s = (PyArrayObject*) PyArray_EMPTY(2, %(z)s = (PyArrayObject*) PyArray_EMPTY(2,
dims, dims,
type_num_%(ten4)s, type_num_%(ten4)s,
0); 0);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
...@@ -162,12 +162,12 @@ class Images2Neibs(Op): ...@@ -162,12 +162,12 @@ class Images2Neibs(Op):
} }
{ // NESTED SCOPE { // NESTED SCOPE
const int nb_batch = (%(ten4)s->dimensions)[0]; const int nb_batch = (%(ten4)s->dimensions)[0];
const int nb_stack = (%(ten4)s->dimensions)[1]; const int nb_stack = (%(ten4)s->dimensions)[1];
const int height = (%(ten4)s->dimensions)[2]; const int height = (%(ten4)s->dimensions)[2];
const int width = (%(ten4)s->dimensions)[3]; const int width = (%(ten4)s->dimensions)[3];
// (c,d) = neib_shape // (c,d) = neib_shape
const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0); const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1); const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
...@@ -177,7 +177,7 @@ class Images2Neibs(Op): ...@@ -177,7 +177,7 @@ class Images2Neibs(Op):
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
// Oh this is messed up... // Oh this is messed up...
for (int n = 0; n < nb_batch; n++) // loop over batches for (int n = 0; n < nb_batch; n++) // loop over batches
for (int s = 0; s < nb_stack; s++) // loop over stacks for (int s = 0; s < nb_stack; s++) // loop over stacks
for (int a = 0; a < grid_c; a++) // loop over the number of patch in height for (int a = 0; a < grid_c; a++) // loop over the number of patch in height
...@@ -194,18 +194,18 @@ class Images2Neibs(Op): ...@@ -194,18 +194,18 @@ class Images2Neibs(Op):
} }
for (int j = 0; j < d; j++) // loop over d for (int j = 0; j < d; j++) // loop over d
{ {
int ten4_3 = j + b * step_y; int ten4_3 = j + b * step_y;
if ( "%(mode)s" == "wrap_centered" ){ if ( "%(mode)s" == "wrap_centered" ){
ten4_3 -= wrap_centered_idx_shift_y; ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width; if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width; else if (ten4_3 >= width) ten4_3 -= width;
} }
int z_col = j + d * i; int z_col = j + d * i;
dtype_%(z)s* curr_z = (dtype_%(z)s*) PyArray_GETPTR2(%(z)s, z_row, z_col); dtype_%(z)s* curr_z = (dtype_%(z)s*) PyArray_GETPTR2(%(z)s, z_row, z_col);
*curr_z = *( (dtype_%(ten4)s*) PyArray_GETPTR4(%(ten4)s, n, s, ten4_2, ten4_3)); *curr_z = *( (dtype_%(ten4)s*) PyArray_GETPTR4(%(ten4)s, n, s, ten4_2, ten4_3));
//printf("\\n(%%i,%%i,%%i,%%i) --> (%%i,%%i)",n,s, ten4_2, ten4_3, z_row, z_col); //printf("\\n(%%i,%%i,%%i,%%i) --> (%%i,%%i)",n,s, ten4_2, ten4_3, z_row, z_col);
//printf("%%f ", *curr_z); //printf("%%f ", *curr_z);
} }
...@@ -220,22 +220,22 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'): ...@@ -220,22 +220,22 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
def neibs2images(neibs, neib_shape, original_shape): def neibs2images(neibs, neib_shape, original_shape):
""" """
Inverse of images2neib. Inverse of images2neib.
neibs : matrix like the one obtained by images2neib neibs : matrix like the one obtained by images2neib
neib_shape : neib_shape that was used in images2neib neib_shape : neib_shape that was used in images2neib
original_shape : original shape of the 4d tensor given to images2neib original_shape : original shape of the 4d tensor given to images2neib
Return a 4d tensor of shape `original_shape`. Return a 4d tensor of shape `original_shape`.
""" """
neibs = T.as_tensor_variable(neibs) neibs = T.as_tensor_variable(neibs)
neib_shape = T.as_tensor_variable(neib_shape) neib_shape = T.as_tensor_variable(neib_shape)
original_shape = T.as_tensor_variable(original_shape) original_shape = T.as_tensor_variable(original_shape)
new_neib_shape = T.stack( original_shape[-1]/neib_shape[1], neib_shape[1] ) new_neib_shape = T.stack( original_shape[-1]/neib_shape[1], neib_shape[1] )
return images2neibs(neibs.dimshuffle('x','x',0,1), new_neib_shape).reshape(original_shape) return images2neibs(neibs.dimshuffle('x','x',0,1), new_neib_shape).reshape(original_shape)
#return images2neibs(neibs.reshape((1,1,neibs.shape[0],neibs.shape[1])), new_neib_shape).reshape(original_shape) #return images2neibs(neibs.reshape((1,1,neibs.shape[0],neibs.shape[1])), new_neib_shape).reshape(original_shape)
# This is work in progress # This is work in progress
class GpuImages2Neibs(Images2Neibs): class GpuImages2Neibs(Images2Neibs):
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
...@@ -251,7 +251,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -251,7 +251,7 @@ class GpuImages2Neibs(Images2Neibs):
assert ten4.ndim==4 assert ten4.ndim==4
assert neib_shape.ndim==1 assert neib_shape.ndim==1
assert neib_step.ndim==1 assert neib_step.ndim==1
return Apply(self, [ten4, neib_shape, neib_step], [CudaNdarrayType(broadcastable=(False,False), return Apply(self, [ten4, neib_shape, neib_step], [CudaNdarrayType(broadcastable=(False,False),
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
...@@ -313,8 +313,8 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -313,8 +313,8 @@ class GpuImages2Neibs(Images2Neibs):
} }
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n)); //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i; int z_col = j + d * i;
int z_idx = z_col + c*d*z_row; int z_idx = z_col + c*d*z_row;
...@@ -375,8 +375,8 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -375,8 +375,8 @@ class GpuImages2Neibs(Images2Neibs):
} }
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n)); //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i; int z_col = j + d * i;
int z_idx = z_col + c*d*z_row; int z_idx = z_col + c*d*z_row;
...@@ -406,7 +406,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -406,7 +406,7 @@ class GpuImages2Neibs(Images2Neibs):
PyErr_Format(PyExc_TypeError, "unis wrong rank"); PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s; %(fail)s;
} }
if (%(neib_shape)s->dimensions[0] != 2) if (%(neib_shape)s->dimensions[0] != 2)
{ {
PyErr_Format(PyExc_ValueError, "neib_shape has to contain two elements"); PyErr_Format(PyExc_ValueError, "neib_shape has to contain two elements");
...@@ -459,7 +459,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -459,7 +459,7 @@ class GpuImages2Neibs(Images2Neibs):
* grid_d * grid_d
* CudaNdarray_HOST_DIMS(%(ten4)s)[1] * CudaNdarray_HOST_DIMS(%(ten4)s)[1]
* CudaNdarray_HOST_DIMS(%(ten4)s)[0]; * CudaNdarray_HOST_DIMS(%(ten4)s)[0];
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0) || (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0)
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1)) || (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1))
...@@ -475,11 +475,11 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -475,11 +475,11 @@ class GpuImages2Neibs(Images2Neibs):
%(fail)s; %(fail)s;
} }
} }
} }
{ // NESTED SCOPE { // NESTED SCOPE
const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0]; const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0];
const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1]; const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1];
const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2]; const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2];
...@@ -489,11 +489,11 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -489,11 +489,11 @@ class GpuImages2Neibs(Images2Neibs):
const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1); const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1); dim3 n_threads(d,c,1);
//Their is a max of 512 threads per blocks //Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--; while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--; while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
//Make bigger block to have better memory access pattern and a higher core utilisation. //Make bigger block to have better memory access pattern and a higher core utilisation.
//for smaller patch size //for smaller patch size
...@@ -519,7 +519,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -519,7 +519,7 @@ class GpuImages2Neibs(Images2Neibs):
f = k_multi_warp_%(name)s; f = k_multi_warp_%(name)s;
} }
f<<<n_blocks, n_threads, n_shared>>>( f<<<n_blocks, n_threads, n_shared>>>(
nb_batch, nb_batch,
nb_stack, nb_stack,
height, width, height, width,
...@@ -534,7 +534,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -534,7 +534,7 @@ class GpuImages2Neibs(Images2Neibs):
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n", PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s", "k_multi_warp_%(name)s",
...@@ -560,4 +560,4 @@ def use_gpu_images2neibs(node): ...@@ -560,4 +560,4 @@ def use_gpu_images2neibs(node):
if cuda_available: if cuda_available:
register_gpu_opt()(use_gpu_images2neibs) register_gpu_opt()(use_gpu_images2neibs)
...@@ -60,7 +60,7 @@ class TrueDot(gof.op.Op): ...@@ -60,7 +60,7 @@ class TrueDot(gof.op.Op):
if self.grad_preserves_dense: if self.grad_preserves_dense:
rval[1] = dense_from_sparse(rval[1]) rval[1] = dense_from_sparse(rval[1])
return rval return rval
def true_dot(x, y, grad_preserves_dense=True): def true_dot(x, y, grad_preserves_dense=True):
""" """
@todo: Maybe the triple-transposition formulation (when x is dense) @todo: Maybe the triple-transposition formulation (when x is dense)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论