提交 032c5b14 authored 作者: David Warde-Farley's avatar David Warde-Farley

Whitespace fixes.

上级 9d85fda7
......@@ -150,15 +150,15 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0],
CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0],
CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(%(am)s), CudaNdarray_HOST_STRIDES(%(am)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err));
// no need to decref output vars the cleanup code should pick them up.
......@@ -233,7 +233,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
CudaNdarray_DEV_DATA(%(dnll)s),
CudaNdarray_HOST_STRIDES(%(dnll)s)[0],
......@@ -249,11 +249,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s", cudaGetErrorString(err));
%(fail)s;
}
}
}
assert(%(dx)s);
""" % locals()
......@@ -337,7 +337,7 @@ class GpuSoftmax (Op):
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
......@@ -347,18 +347,18 @@ class GpuSoftmax (Op):
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s;
}
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s"%nodename,
params=['int M', 'int N',
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm'],
body=[
......@@ -436,7 +436,7 @@ class GpuSoftmaxWithBias (Op):
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
......@@ -449,18 +449,18 @@ class GpuSoftmaxWithBias (Op):
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s;
}
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmaxWithBias_%s"%nodename,
params=['int M', 'int N',
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm'],
......
......@@ -18,7 +18,7 @@ grad_todo = GradTodo()
class FFT(Op):
"""Fast Fourier Transform
.. TODO:
The current implementation just works for matrix inputs, and permits taking a 1D FFT over
either rows or columns. Add support for N-D FFTs as provided by either numpy or FFTW
......@@ -29,7 +29,7 @@ class FFT(Op):
.. TODO:
unit tests.
"""
default_output = 0
......@@ -61,7 +61,7 @@ class FFT(Op):
raise TypeError('Argument to HalfFFT must not be complex', frames)
spectrogram = tensor.zmatrix()
buf = generic()
# The `buf` output is present for future work
# The `buf` output is present for future work
# when we call FFTW directly and re-use the 'plan' that FFTW creates.
# In that case, buf would store a CObject encapsulating the plan.
rval = Apply(self, [_frames, _n, _axis], [spectrogram, buf])
......
......@@ -25,13 +25,13 @@ class Minimal(gof.Op):
return hash(type(self))
def make_node(self, *args):
# HERE `args` must be THEANO VARIABLES
# HERE `args` must be THEANO VARIABLES
return gof.Apply(op=self, inputs=args, outputs=[tensor.lscalar()])
def perform(self, node, inputs, (output, )):
# HERE `inputs` are PYTHON OBJECTS
# do what you want here,
# do what you want here,
# but do not modify any of the arguments [inplace].
print "perform got %i arguments" % len(inputs)
......
......@@ -59,12 +59,12 @@ class Multinomial(Op):
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims,
type_num_%(pvals)s,
0);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
......@@ -96,7 +96,7 @@ class Multinomial(Op):
}
}
}
} // END NESTED SCOPE
""" % locals()
multinomial = Multinomial()
......@@ -128,24 +128,24 @@ class GpuMultinomial(Multinomial):
float * global_unis,
float * global_outs
)
{
{
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
{
float cummul = 0.;
bool done = false;
for (int m = 0; m < nb_outcomes; ++m)
{
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.;
if (!done && global_unis[n] < cummul)
{
current_out = 1.;
done = true;
}
}
global_outs[n + m * nb_multi] = current_out;
}
}
......@@ -157,7 +157,7 @@ class GpuMultinomial(Multinomial):
def c_code(self, node, name, (pvals, unis), (z,), sub):
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
......@@ -168,7 +168,7 @@ class GpuMultinomial(Multinomial):
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
......@@ -201,7 +201,7 @@ class GpuMultinomial(Multinomial):
{ // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
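//TODO : change this for a named constant. NOTE(review): `2<<15 - 1` parses as `2<<(15 - 1)` (= 32768) because `-` binds tighter than `<<`; confirm whether `(2<<15) - 1` (= 65535) was intended.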
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
......@@ -212,7 +212,7 @@ class GpuMultinomial(Multinomial):
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
......@@ -224,7 +224,7 @@ class GpuMultinomial(Multinomial):
%(fail)s;
}
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
......@@ -240,7 +240,7 @@ class GpuMultinomial(Multinomial):
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
......@@ -264,4 +264,4 @@ def use_gpu_multinomial(node):
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial)
......@@ -7,7 +7,7 @@ import numpy
import __builtin__
class NeighbourhoodsFromImages(Op):
def __init__(self, n_dims_before, dims_neighbourhoods,
def __init__(self, n_dims_before, dims_neighbourhoods,
strides=None, ignore_border=False, inverse=False):
"""
This extracts neighbourhoods from "images", but in a
......@@ -65,10 +65,10 @@ class NeighbourhoodsFromImages(Op):
"""
self.n_dims_before = n_dims_before
self.dims_neighbourhoods = dims_neighbourhoods
if not strides is None:
self.strides = strides
else:
self.strides = dims_neighbourhoods
if not strides is None:
self.strides = strides
else:
self.strides = dims_neighbourhoods
self.ignore_border = ignore_border
self.inverse = inverse
......@@ -99,7 +99,7 @@ class NeighbourhoodsFromImages(Op):
def __str__(self):
return '%s{%s,%s,%s,%s}' % \
(self.__class__.__name__,
(self.__class__.__name__,
self.n_dims_before,
self.dims_neighbourhoods,
self.strides,
......@@ -135,7 +135,7 @@ class NeighbourhoodsFromImages(Op):
# the number of strides performed by NeighFromImg is
# directly given by this shape
num_strides.append(output_shape[self.n_dims_before + i])
# our Op's output image must be at least this wide
at_least_width = num_strides[i] * self.strides[i]
......@@ -231,7 +231,7 @@ class NeighbourhoodsFromImages(Op):
("for neigh_idx_%d in xrange(min(max_neigh_idx_%d,"\
+" self.dims_neighbourhoods[%d])):\n") % \
(inner_dim_no, inner_dim_no, inner_dim_no)
return code_before
def _py_flattened_idx(self):
......@@ -268,8 +268,8 @@ class NeighbourhoodsFromImages(Op):
class ImagesFromNeighbourhoods(NeighbourhoodsFromImages):
def __init__(self, n_dims_before, dims_neighbourhoods,
strides=None, ignore_border=False):
NeighbourhoodsFromImages.__init__(self,n_dims_before, dims_neighbourhoods,
strides=strides, ignore_border=ignore_border,
NeighbourhoodsFromImages.__init__(self,n_dims_before, dims_neighbourhoods,
strides=strides, ignore_border=ignore_border,
inverse=True)
# and that's all there is to it
......@@ -88,7 +88,7 @@ class Images2Neibs(Op):
PyErr_Format(PyExc_TypeError, "neib_step wrong step ; has to contain 2 elements");
%(fail)s;
}
// (c,d) = neib_shape
const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
......@@ -137,7 +137,7 @@ class Images2Neibs(Op):
* grid_d
* (%(ten4)s->dimensions)[1]
* (%(ten4)s->dimensions)[0];
if ((NULL == %(z)s)
|| ((%(z)s->dimensions)[0] != z_dim0 )
|| ((%(z)s->dimensions)[1] != z_dim1 )
......@@ -147,12 +147,12 @@ class Images2Neibs(Op):
npy_intp dims[2];
dims[0] = z_dim0;
dims[1] = z_dim1;
%(z)s = (PyArrayObject*) PyArray_EMPTY(2,
dims,
type_num_%(ten4)s,
0);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
......@@ -162,12 +162,12 @@ class Images2Neibs(Op):
}
{ // NESTED SCOPE
const int nb_batch = (%(ten4)s->dimensions)[0];
const int nb_stack = (%(ten4)s->dimensions)[1];
const int height = (%(ten4)s->dimensions)[2];
const int width = (%(ten4)s->dimensions)[3];
// (c,d) = neib_shape
const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
......@@ -177,7 +177,7 @@ class Images2Neibs(Op):
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
// Oh this is messed up...
// Oh this is messed up...
for (int n = 0; n < nb_batch; n++) // loop over batches
for (int s = 0; s < nb_stack; s++) // loop over stacks
for (int a = 0; a < grid_c; a++) // loop over the number of patch in height
......@@ -194,18 +194,18 @@ class Images2Neibs(Op):
}
for (int j = 0; j < d; j++) // loop over d
{
int ten4_3 = j + b * step_y;
int ten4_3 = j + b * step_y;
if ( "%(mode)s" == "wrap_centered" ){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
int z_col = j + d * i;
dtype_%(z)s* curr_z = (dtype_%(z)s*) PyArray_GETPTR2(%(z)s, z_row, z_col);
*curr_z = *( (dtype_%(ten4)s*) PyArray_GETPTR4(%(ten4)s, n, s, ten4_2, ten4_3));
//printf("\\n(%%i,%%i,%%i,%%i) --> (%%i,%%i)",n,s, ten4_2, ten4_3, z_row, z_col);
//printf("%%f ", *curr_z);
}
......@@ -220,22 +220,22 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
def neibs2images(neibs, neib_shape, original_shape):
"""
Inverse of images2neibs.
neibs : matrix like the one obtained by images2neibs
neib_shape : neib_shape that was used in images2neibs
original_shape : original shape of the 4d tensor given to images2neibs
Return a 4d tensor of shape `original_shape`.
"""
neibs = T.as_tensor_variable(neibs)
neib_shape = T.as_tensor_variable(neib_shape)
original_shape = T.as_tensor_variable(original_shape)
new_neib_shape = T.stack( original_shape[-1]/neib_shape[1], neib_shape[1] )
return images2neibs(neibs.dimshuffle('x','x',0,1), new_neib_shape).reshape(original_shape)
#return images2neibs(neibs.reshape((1,1,neibs.shape[0],neibs.shape[1])), new_neib_shape).reshape(original_shape)
# This is work in progress
class GpuImages2Neibs(Images2Neibs):
def __init__(self, mode='valid'):
......@@ -251,7 +251,7 @@ class GpuImages2Neibs(Images2Neibs):
assert ten4.ndim==4
assert neib_shape.ndim==1
assert neib_step.ndim==1
return Apply(self, [ten4, neib_shape, neib_step], [CudaNdarrayType(broadcastable=(False,False),
dtype=ten4.type.dtype)()])
......@@ -313,8 +313,8 @@ class GpuImages2Neibs(Images2Neibs):
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col + c*d*z_row;
......@@ -375,8 +375,8 @@ class GpuImages2Neibs(Images2Neibs):
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col + c*d*z_row;
......@@ -406,7 +406,7 @@ class GpuImages2Neibs(Images2Neibs):
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (%(neib_shape)s->dimensions[0] != 2)
{
PyErr_Format(PyExc_ValueError, "neib_shape has to contain two elements");
......@@ -459,7 +459,7 @@ class GpuImages2Neibs(Images2Neibs):
* grid_d
* CudaNdarray_HOST_DIMS(%(ten4)s)[1]
* CudaNdarray_HOST_DIMS(%(ten4)s)[0];
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0)
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1))
......@@ -475,11 +475,11 @@ class GpuImages2Neibs(Images2Neibs):
%(fail)s;
}
}
}
{ // NESTED SCOPE
const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0];
const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1];
const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2];
......@@ -489,11 +489,11 @@ class GpuImages2Neibs(Images2Neibs):
const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
//There is a maximum of 512 threads per block
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
//Make bigger blocks to get a better memory access pattern and higher core utilisation
//for smaller patch sizes
......@@ -519,7 +519,7 @@ class GpuImages2Neibs(Images2Neibs):
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>(
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
......@@ -534,7 +534,7 @@ class GpuImages2Neibs(Images2Neibs):
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
......@@ -560,4 +560,4 @@ def use_gpu_images2neibs(node):
if cuda_available:
register_gpu_opt()(use_gpu_images2neibs)
......@@ -60,7 +60,7 @@ class TrueDot(gof.op.Op):
if self.grad_preserves_dense:
rval[1] = dense_from_sparse(rval[1])
return rval
def true_dot(x, y, grad_preserves_dense=True):
"""
@todo: Maybe the triple-transposition formulation (when x is dense)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论