Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
7e84537f
提交
7e84537f
authored
6月 21, 2011
作者:
James Bergstra
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
adding sparse sandbox
上级
24128a8f
显示空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
1091 行增加
和
0 行删除
+1091
-0
__init__.py
theano/sparse/sandbox/__init__.py
+0
-0
sp.py
theano/sparse/sandbox/sp.py
+643
-0
test_sp.py
theano/sparse/sandbox/test_sp.py
+448
-0
没有找到文件。
theano/sparse/sandbox/__init__.py
0 → 100644
浏览文件 @
7e84537f
theano/sparse/sandbox/sp.py
0 → 100644
浏览文件 @
7e84537f
"""
Convolution-like operations with sparse matrix multiplication.
To read about different sparse formats, see U{http://www-users.cs.umn.edu/~saad/software/SPARSKIT/paper.ps}.
@todo: Automatic methods for determining best sparse format?
"""
#### COPIED FROM hpu/icml09/sp.py
import
numpy
from
scipy
import
sparse
as
scipy_sparse
import
theano
import
theano.sparse
from
theano
import
sparse
,
gof
,
Op
,
tensor
from
theano.printing
import
Print
def register_specialize(lopt, *tags, **kwargs):
    """Register a local optimizer in theano's 'specialize' optimization db.

    The registration name is ``kwargs['name']`` when supplied, otherwise the
    optimizer's ``__name__``; the 'fast_run' tag is always attached in
    addition to any extra ``tags``.
    """
    name = kwargs.pop('name') if kwargs else None
    theano.compile.optdb['specialize'].register(
        name or lopt.__name__, lopt, 'fast_run', *tags)
class SpSum(Op):
    """Sum the elements of a sparse matrix, either over all elements or
    along one axis.

    NOTE: the original docstring ("Scale each columns...") was copied from
    ColScaleCSC and did not describe this Op; it performs a summation.

    axis        -- None (grand total), 0 (per-column sum) or 1 (per-row sum).
    sparse_grad -- if True, the gradient ignores matrix elements which are
                   implicitly 0.
    """
    axis = None
    sparse_grad = False

    def __init__(self, axis, sparse_grad):
        """
        :param axis: None, 0 or 1 -- the axis to sum over.
        :param sparse_grad: if True, this instance ignores the gradient on
            matrix elements which are implicitly 0.
        """
        super(SpSum, self).__init__()
        self.axis = axis
        self.sparse_grad = sparse_grad
        if self.axis not in (None, 0, 1):
            raise ValueError('illegal value for self.axis')

    def __eq__(self, other):
        # WARNING: judgement call...
        # sparse_grad is deliberately excluded from __eq__/__hash__ because it
        # does not change the perform method; we *do* want Sums with different
        # sparse_grad values to be merged by the merge optimization, which
        # requires them to compare equal.
        return type(self) == type(other) and self.axis == other.axis

    def __hash__(self):
        return 76324 ^ hash(type(self)) ^ hash(self.axis)

    def make_node(self, x):
        # Note: at least for small matrices (5x5), the .sum() method of a csc
        # matrix returns a *dense* matrix whether axis is 0 or 1.
        if self.axis is None:
            z = tensor.tensor(broadcastable=(), dtype=x.dtype)
        elif self.axis == 0:
            if x.format == 'csc':
                # BUGFIX: the original referenced the undefined name `T`
                # (this module imports `tensor`, not `T`), raising NameError
                # at graph-construction time.
                z = tensor.tensor(broadcastable=(False,), dtype=x.dtype)
            elif x.format == 'csr':
                #return SparseVector() #WRITEME!
                raise NotImplementedError()
            else:
                raise NotImplementedError()
        elif self.axis == 1:
            if x.format == 'csc':
                #return SparseVector() #WRITEME!
                raise NotImplementedError()
            elif x.format == 'csr':
                # BUGFIX: same `T` -> `tensor` repair as the axis==0 branch.
                z = tensor.tensor(broadcastable=(False,), dtype=x.dtype)
            else:
                raise NotImplementedError()
        else:
            assert False  # axis was validated by __init__
        return gof.Apply(self, [x], [z])

    def perform(self, node, inputs, outputs):
        # Python-3 compatible signature: tuple-parameter unpacking was
        # removed by PEP 3113.
        (x,), (z,) = inputs, outputs
        if self.axis is None:
            z[0] = x.sum()
        if self.axis == 0:
            z[0] = numpy.asarray(x.sum(axis=0)).reshape((x.shape[1],))
        if self.axis == 1:
            z[0] = numpy.asarray(x.sum(axis=1)).reshape((x.shape[0],))

    def grad(self, inputs, g_outputs):
        (x,), (gz,) = inputs, g_outputs
        if self.sparse_grad:
            if self.axis is None:
                return [gz * theano.sparse.sp_ones_like(x)]
            elif self.axis == 0:
                return col_scale(theano.sparse.sp_ones_like(x), gz)
            elif self.axis == 1:
                return row_scale(theano.sparse.sp_ones_like(x), gz)
            else:
                assert False
        else:
            raise NotImplementedError()
def sp_sum(x, axis=None, sparse_grad=False):
    """Return the symbolic sum of sparse matrix ``x`` over ``axis``."""
    summation_op = SpSum(axis, sparse_grad)
    return summation_op(x)
class Diag(Op):
    """Extract the diagonal of a square sparse (csc) matrix as a dense vector."""

    def make_node(self, x):
        return gof.Apply(self, [x],
                [tensor.tensor(broadcastable=(False,), dtype=x.dtype)])

    def perform(self, node, inputs, outputs):
        # Python-3 compatible signature (PEP 3113); xrange -> range.
        (x,), (z,) = inputs, outputs
        M, N = x.shape
        if M != N:
            raise ValueError("DenseDiag argument not square. Shape:", x.shape)
        assert x.format == 'csc'
        data = x.data
        indices = x.indices
        indptr = x.indptr
        diag = numpy.zeros(N, x.dtype)
        #TODO: try using ndarrays and then prune() on the result
        for j in range(0, N):
            # scan column j for entries lying on the diagonal (row index == j);
            # += accumulates duplicates, matching csc duplicate-entry semantics
            for i_idx in range(indptr[j], indptr[j + 1]):
                if indices[i_idx] == j:
                    diag[j] += data[i_idx]
        z[0] = diag

    def grad(self, inputs, g_outputs):
        (gz,) = g_outputs
        # gradient of "extract diagonal" is "embed vector as diagonal matrix"
        return [square_diagonal(gz)]


diag = Diag()
class SquareDiagonal(Op):
    """Return a square sparse (csc) matrix whose diagonal is given by the
    dense vector argument.
    """

    def make_node(self, diag):
        return gof.Apply(self, [diag],
                [sparse.Sparse(dtype=diag.dtype, format='csc').make_result()])

    def perform(self, node, inputs, outputs):
        # Python-3 compatible signature (PEP 3113).
        (diag,), (z,) = inputs, outputs
        N, = diag.shape
        # column j holds exactly one nonzero, at row j; use an explicit
        # integer array (the original sliced a py2 range-list, which on py3
        # would be a `range` object -- still array-like, but arange is clearer)
        indptr = numpy.arange(N + 1)
        indices = indptr[0:N]
        z[0] = scipy_sparse.csc_matrix((diag, indices, indptr), (N, N),
                copy=True)

    def grad(self, input, g_outputs):
        (gz,) = g_outputs
        # gradient of "embed as diagonal" is "extract diagonal"
        return [diag(gz)]


square_diagonal = SquareDiagonal()
class ColScaleCSC(Op):
    """Scale each column of a sparse matrix by the corresponding element of
    a dense vector.
    """

    def make_node(self, x, s):
        if x.format != 'csc':
            raise ValueError('x was not a csc matrix')
        return gof.Apply(self, [x, s], [x.type()])

    def perform(self, node, inputs, outputs):
        # Python-3 compatible signature (PEP 3113); xrange -> range.
        (x, s), (z,) = inputs, outputs
        M, N = x.shape
        assert x.format == 'csc'
        assert s.shape == (N,)
        y = x.copy()
        for j in range(0, N):
            # data[indptr[j]:indptr[j+1]] is exactly column j's nonzeros
            y.data[y.indptr[j]:y.indptr[j + 1]] *= s[j]
        z[0] = y

    def grad(self, inputs, g_outputs):
        (x, s), (gz,) = inputs, g_outputs
        # d/dx: scale the output gradient by the same column factors;
        # d/ds: per-column sum of x*gz (shape (N,), matching s)
        return [col_scale(gz, s), sp_sum(x * gz, axis=0)]
class RowScaleCSC(Op):
    """Scale each row of a sparse matrix by the corresponding element of a
    dense vector.
    """

    def make_node(self, x, s):
        return gof.Apply(self, [x, s], [x.type()])

    def perform(self, node, inputs, outputs):
        # Python-3 compatible signature (PEP 3113); xrange -> range.
        (x, s), (z,) = inputs, outputs
        M, N = x.shape
        assert x.format == 'csc'
        assert s.shape == (M,)
        indices = x.indices
        indptr = x.indptr
        y_data = x.data.copy()
        for j in range(0, N):
            for i_idx in range(indptr[j], indptr[j + 1]):
                # indices[i_idx] is the row of this nonzero; scale by its
                # row factor
                y_data[i_idx] *= s[indices[i_idx]]
        z[0] = scipy_sparse.csc_matrix((y_data, indices, indptr), (M, N))

    def grad(self, inputs, g_outputs):
        (x, s), (gz,) = inputs, g_outputs
        # BUGFIX: the gradient wrt s must have s's shape (M,). Summing x*gz
        # over axis=0 (as the original did) yields shape (N,); for row
        # scaling the correct reduction is over axis=1.
        return [row_scale(gz, s), sp_sum(x * gz, axis=1)]
def col_scale(x, s):
    """Scale each column of sparse matrix ``x`` by dense vector ``s``."""
    fmt = x.format
    if fmt == 'csc':
        return ColScaleCSC()(x, s)
    if fmt == 'csr':
        # csr is the transposed layout of csc: scale the rows of x.T instead
        return RowScaleCSC()(x.T, s).T
    raise NotImplementedError()
def row_scale(x, s):
    """Scale each row of sparse matrix ``x`` by dense vector ``s``."""
    scaled_transpose = col_scale(x.T, s)
    return scaled_transpose.T
class Remove0(Op):
    """Remove explicit zeros from a sparse (csc) matrix."""

    def make_node(self, x):
        return gof.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, outputs):
        # Python-3 compatible signature (PEP 3113); xrange -> range.
        (x,), (z,) = inputs, outputs
        if x.format != 'csc':
            raise TypeError('Remove0 only works on csc matrices')
        M, N = x.shape
        data = x.data
        indices = x.indices
        indptr = x.indptr
        #TODO: try using ndarrays and then prune() on the result
        new_data = []
        new_indices = []
        new_indptr = [0]
        for j in range(0, N):
            # keep only the entries of column j whose stored value is nonzero
            for i_idx in range(indptr[j], indptr[j + 1]):
                if data[i_idx] != 0:
                    new_data.append(data[i_idx])
                    new_indices.append(indices[i_idx])
            new_indptr.append(len(new_indices))
        # BUGFIX: build the result with scipy's constructor. In this module
        # `sparse` is theano.sparse (symbolic); the original call therefore
        # did not construct a scipy matrix as every other perform() here does
        # via `scipy_sparse`.
        z[0] = scipy_sparse.csc_matrix((new_data, new_indices, new_indptr),
                (M, N))

    def grad(self, inputs, g_outputs):
        (gz,) = g_outputs
        # removing stored zeros does not change any value: identity gradient
        return [gz]


remove0 = Remove0()
class EnsureSortedIndices(Op):
    """Return the input sparse matrix with its indices sorted.

    NOTE(review): the original docstring ("Remove explicit zeros from a
    sparse matrix, and resort indices") was copied from Remove0 and is
    wrong -- this Op only sorts indices; it does not remove zeros.
    """
    # when inplace is True, view_map declares that output 0 is a view of
    # input 0
    inplace = False

    def __init__(self, inplace):
        self.inplace = inplace
        if self.inplace:
            self.view_map = {0: [0]}

    def make_node(self, x):
        return gof.Apply(self, [x], [x.type()])

    # NOTE(review): Python-2-only tuple-unpacking signatures below (removed
    # by PEP 3113); they would need rewriting as (self, node, inputs,
    # outputs) for Python 3.
    def perform(self, node, (x,), (z,)):
        z[0] = x.ensure_sorted_indices(inplace=self.inplace)

    def grad(self, (x,), (gz,)):
        # sorting indices does not change any value: identity gradient
        return [gz]


ensure_sorted_indices = EnsureSortedIndices(inplace=False)
def clean(x):
    """Return ``x`` with explicit zeros removed and indices sorted."""
    without_zeros = remove0(x)
    return ensure_sorted_indices(without_zeros)
class ConvolutionIndices(Op):
    """Build indices for a sparse CSC matrix that could implement A (convolve) B.

    This generates a sparse matrix M, which generates a stack of image patches
    when computing the dot product of M with image patch. Convolution is then
    simply the dot product of (img x M) and the kernels.
    """

    # NOTE(review): the static methods below use Python-2-only tuple
    # unpacking in their signatures ((dx, dy)=(1, 1)); PEP 3113 removed this
    # syntax in Python 3.

    @staticmethod
    def sparse_eval(inshp, kshp, nkern, (dx, dy)=(1, 1), mode='valid'):
        # no weight sharing (ws=False): one kernel per output pixel
        return convolution_indices.evaluate(inshp, kshp, (dx, dy), nkern,
                mode=mode, ws=False)

    @staticmethod
    def conv_eval(inshp, kshp, (dx, dy)=(1, 1), mode='valid'):
        # weight sharing (ws=True): classic convolution
        return convolution_indices.evaluate(inshp, kshp, (dx, dy),
                mode=mode, ws=True)

    # img_shape and ker_shape are (height,width)
    @staticmethod
    def evaluate(inshp, kshp, (dx, dy)=(1, 1), nkern=1, mode='valid', ws=True):
        """Build a sparse matrix which can be used for performing...
        * convolution: in this case, the dot product of this matrix with the
          input images will generate a stack of images patches. Convolution is
          then a tensordot operation of the filters and the patch stack.
        * sparse local connections: in this case, the sparse matrix allows us
          to operate the weight matrix as if it were fully-connected. The
          structured-dot with the input image gives the output for the
          following layer.

        @param ker_shape: shape of kernel to apply (smaller than image)
        @param img_shape: shape of input images
        @param mode: 'valid' generates output only when kernel and image
                     overlap; 'full' full convolution obtained by zero-padding
                     the input
        @param ws: True if weight sharing, false otherwise
        @param (dx,dy): offset parameter. In the case of no weight sharing,
                     gives the pixel offset between two receptive fields. With
                     weight sharing gives the offset between the top-left
                     pixels of the generated patches
        @rtype: tuple(indices, indptr, logical_shape, sp_type, out_img_shp)
        @returns: the structure of a sparse matrix, and the logical dimensions
                  of the image which will be the result of filtering.
                  NOTE(review): when ws=False the tuple also gains a trailing
                  `kmap` element (6 values instead of 5) -- callers must
                  unpack accordingly.
        """
        N = numpy

        # inshp contains either 2 entries (height,width) or 3 (nfeatures,h,w)
        # in the first case, default nfeatures to 1
        if N.size(inshp) == 2:
            inshp = (1,) + inshp
        inshp = N.array(inshp)
        kshp = N.array(kshp)
        ksize = N.prod(kshp)
        kern = ksize - 1 - N.arange(ksize)

        # size of output image if doing proper convolution (mode='full',dx=dy=0)
        # outshp is the actual output shape given the parameters
        fulloutshp = inshp[1:] + kshp - 1
        s = -1 if mode == 'valid' else 1
        outshp = N.int64(N.ceil((inshp[1:] + s * kshp - s * 1) \
                / N.array([dy, dx], dtype='float')))
        if any(outshp <= 0):
            err = 'Invalid kernel', kshp, 'and/or step size', (dx, dy), \
                  'for given input shape', inshp
            raise ValueError(err)

        outsize = N.prod(outshp)
        insize = N.prod(inshp)

        # range of output units over which to iterate
        lbound = N.array([kshp[0] - 1, kshp[1] - 1]) if mode == 'valid' \
                else N.zeros(2)
        ubound = lbound + (inshp[1:] - kshp + 1) if mode == 'valid' \
                else fulloutshp

        # coordinates of image in "fulloutshp" coordinates
        topleft = N.array([kshp[0] - 1, kshp[1] - 1])
        # bound when counting the receptive field
        botright = topleft + inshp[1:]

        # sparse matrix specifics...
        spmatshp = (outsize * N.prod(kshp) * inshp[0], insize) if ws else \
                   (nkern * outsize, insize)
        spmat = scipy_sparse.lil_matrix(spmatshp)

        # loop over output image pixels
        z, zz = 0, 0

        # incremented every time we write something to the sparse matrix;
        # this is used to track the ordering of filter tap coefficient in
        # sparse column ordering
        tapi, ntaps = 0, 0

        # Note: looping over the number of kernels could've been done more
        # efficiently as the last step (when writing to spmat). However, this
        # messes up the ordering of the column values (order in which you
        # write the values determines how the vectorized data will get used
        # later on)
        for fmapi in range(inshp[0]):  # loop over input features
            # loop over number of kernels (nkern=1 for weight sharing)
            for n in range(nkern):
                # FOR EACH OUTPUT PIXEL...
                # loop over output image height
                for oy in N.arange(lbound[0], ubound[0], dy):
                    # loop over output image width
                    for ox in N.arange(lbound[1], ubound[1], dx):
                        # kern[l] is filter value to apply at (oj,oi)
                        # for (iy,ix)
                        l = 0
                        # ... ITERATE OVER INPUT UNITS IN RECEPTIVE FIELD
                        for ky in oy + N.arange(kshp[0]):
                            for kx in ox + N.arange(kshp[1]):
                                # verify if we are still within image
                                # boundaries. Equivalent to zero-padding of
                                # the input image
                                if all((ky, kx) >= topleft) and \
                                        all((ky, kx) < botright):
                                    # convert to "valid" input space coords,
                                    # used to determine column index to write
                                    # to in sparse mat
                                    iy, ix = N.array((ky, kx)) - topleft
                                    # determine raster-index of input pixel,
                                    # taking into account multiple input
                                    # features
                                    col = iy * inshp[2] + ix + \
                                          fmapi * N.prod(inshp[1:])

                                    # convert oy,ox values to output space
                                    # coordinates
                                    (y, x) = (oy, ox) if mode == 'full' \
                                            else (oy, ox) - topleft
                                    # taking into account step size
                                    # NOTE(review): relies on Python-2
                                    # integer division; on Python 3 `/`
                                    # yields floats here and would break the
                                    # row computation (needs `//`).
                                    (y, x) = N.array([y, x]) / (dy, dx)

                                    # convert to row index of sparse matrix
                                    row = (y * outshp[1] + x) * inshp[0] * \
                                          ksize + l + fmapi * ksize if ws else \
                                          y * outshp[1] + x

                                    # Store something at that location in
                                    # sparse matrix. The written value is only
                                    # useful for the sparse case. It will
                                    # determine the way kernel taps are mapped
                                    # onto the sparse columns (idea of kernel
                                    # map). n*... only for sparse
                                    spmat[row + n * outsize, col] = tapi + 1

                                    # total number of active taps
                                    # (used for kmap)
                                    ntaps += 1

                                # absolute tap index (total number of taps)
                                tapi += 1
                                # move on to next filter tap l=(l+1)%ksize
                                l += 1

        if spmat.format != 'csc':
            spmat = spmat.tocsc().ensure_sorted_indices()
        else:
            # BUG ALERT: scipy0.6 has bug where data and indices are written
            # in reverse column ordering. Explicit call to
            # ensure_sorted_indices removes this problem
            spmat = spmat.ensure_sorted_indices()

        if ws:
            kmap = None
        else:
            # kmap[k] maps the k-th stored nonzero (in csc order) back to the
            # tap index that was written into the lil matrix above
            kmap = N.zeros(ntaps, dtype='int')
            k = 0
            #print 'TEMPORARY BUGFIX: REMOVE !!!'
            for j in xrange(spmat.shape[1]):
                for i_idx in xrange(spmat.indptr[j], spmat.indptr[j + 1]):
                    if spmat.data[i_idx] != 0:
                        # this is == spmat[i,j] - 1
                        kmap[k] = spmat.data[i_idx] - 1
                        k += 1

        # when in valid mode, it is more efficient to store in sparse row
        # TODO: need to implement structured dot for csr matrix
        assert spmat.format == 'csc'
        sptype = 'csc'
        #sptype = 'csr' if mode=='valid' else 'csc'
        # deliberately disabled (dead) branch -- kept from the original
        if 0 and mode == 'valid':
            spmat = spmat.tocsr()

        rval = (spmat.indices[:spmat.size], spmat.indptr, spmatshp, sptype,
                outshp)
        # NOTE(review): `!= None` should be `is not None`
        rval += (kmap,) if kmap != None else ()
        return rval

    def perform(self, node, (inshp, kshp), \
            (out_indices, out_indptr, spmat_shape)):
        # NOTE(review): this method looks stale/broken: evaluate() returns 5
        # (or 6) values but only 4 are unpacked here, and `N` is not defined
        # in this scope (it is a local alias inside evaluate) -- both would
        # raise at runtime. TODO confirm and repair before relying on it.
        indices, indptr, spmatshp, outshp = self.evaluate(inshp, kshp)
        out_indices[0] = indices
        out_indptr[0] = indptr
        spmat_shape[0] = N.asarray(spmatshp)


convolution_indices = ConvolutionIndices()
def applySparseFilter(kerns, kshp, nkern, images, imgshp, step=(1, 1),
        bias=None, mode='valid'):
    """Apply a bank of non-weight-sharing (sparse, locally connected) filters.

    === Input / Output conventions ===
    "images" is assumed to be a matrix of shape batch_size x img_size, where
    the second dimension represents each image in raster order

    Output feature map will have shape:
        batch_size x number of kernels * output_size

    IMPORTANT: note that this means that each feature map is contiguous in
    memory. The memory layout will therefore be:
    [ <feature_map_0> <feature_map_1> ... <feature_map_n>],
    where <feature_map> represents a "feature map" in raster order

    Note that the concept of feature map doesn't really apply to sparse
    filters without weight sharing. Basically, nkern=1 will generate one
    output img/feature map, nkern=2 a second feature map, etc.

    kerns is a 1D tensor, and assumed to be of shape:
        nkern * N.prod(outshp) x N.prod(kshp)
    Each filter is applied separately to consecutive output pixels.

    @param kerns: nkern*outsize*ksize vector containing kernels
    @param kshp: tuple containing actual dimensions of kernel (not symbolic)
    @param nkern: number of kernels to apply at each pixel in the input image.
                  nkern=1 will apply a single unique filter for each input
                  pixel.
    @param images: bsize x imgsize matrix containing images on which to apply
                  filters
    @param imgshp: tuple containing actual image dimensions (not symbolic)
    @param step: determines number of pixels between adjacent receptive
                 fields (tuple containing dx,dy values)
    @param mode: 'full', 'valid' see CSM.evaluate function for details
    @output out1: symbolic result
    @output out2: logical shape of the output img (nkern,height,width)
                  (after dot product, not of the sparse matrix!)
    """
    # imgshp contains either 2 entries (height,width) or 3 (nfeatures,h,w);
    # in the first case, default nfeatures to 1
    if numpy.size(imgshp) == 2:
        imgshp = (1,) + imgshp

    # construct indices and index pointers for sparse matrix
    # (sparse_eval -> ws=False, so a 6-tuple including kmap is returned)
    indices, indptr, spmat_shape, sptype, outshp, kmap = \
        convolution_indices.sparse_eval(imgshp, kshp, nkern, step, mode)

    # build a sparse weight matrix from the flat kernel vector
    sparsew = theano.sparse.CSM(sptype, kmap)(kerns, indices, indptr,
            spmat_shape)
    # structured dot against the transposed image batch; transpose back so
    # rows are again batch entries
    output = sparse.structured_dot(sparsew, images.T).T
    if bias is not None:
        output += bias

    return output, numpy.hstack((nkern, outshp))
def convolve(kerns, kshp, nkern, images, imgshp, step=(1, 1), bias=None, \
        mode='valid', flatten=True):
    """Convolution implementation by sparse matrix multiplication.

    @note: For best speed, put the matrix which you expect to be smaller as
    the 'kernel' argument

    === Input / Output conventions ===
    "images" is assumed to be a matrix of shape batch_size x img_size, where
    the second dimension represents each image in raster order

    If flatten is "False", the output feature map will have shape:
        batch_size x number of kernels x output_size
    If flatten is "True", the output feature map will have shape:
        batch_size x number of kernels * output_size

    IMPORTANT: note that this means that each feature map (image generated by
    each kernel) is contiguous in memory. The memory layout will therefore be:
    [ <feature_map_0> <feature_map_1> ... <feature_map_n>],
    where <feature_map> represents a "feature map" in raster order

    kerns is a 2D tensor of shape nkern x N.prod(kshp)

    @param kerns: 2D tensor containing kernels which are applied at every
                  pixel
    @param kshp: tuple containing actual dimensions of kernel (not symbolic)
    @param nkern: number of kernels/filters to apply. nkern=1 will apply one
                  common filter to all input pixels
    @param images: tensor containing images on which to apply convolution
    @param imgshp: tuple containing image dimensions
    @param step: determines number of pixels between adjacent receptive
                 fields (tuple containing dx,dy values)
    @param mode: 'full', 'valid' see CSM.evaluate function for details
    @param flatten: flatten the last 2 dimensions of the output. By default,
                    instead of generating a batchsize x outsize x nkern
                    tensor, will flatten to batchsize x outsize*nkern
    @output out1: symbolic result
    @output out2: logical shape of the output img (nkern,height,width)
    @TODO: test for 1D and think of how to do n-d convolutions
    """
    N = numpy

    # start by computing output dimensions, size, etc
    kern_size = N.int64(N.prod(kshp))

    # imgshp contains either 2 entries (height,width) or 3 (nfeatures,h,w);
    # in the first case, default nfeatures to 1
    if N.size(imgshp) == 2:
        imgshp = (1,) + imgshp

    # construct indices and index pointers for sparse matrix, which, when
    # multiplied with input images will generate a stack of image patches
    indices, indptr, spmat_shape, sptype, outshp = \
        convolution_indices.conv_eval(imgshp, kshp, step, mode)

    # build sparse matrix, then generate stack of image patches
    csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr,
            spmat_shape)
    patches = (sparse.structured_dot(csc, images.T)).T

    # compute output of linear classifier
    pshape = tensor.stack(images.shape[0] * tensor.as_tensor(N.prod(outshp)), \
            tensor.as_tensor(imgshp[0] * kern_size))
    patch_stack = tensor.reshape(patches, pshape, ndim=2);

    # kern is of shape: nkern x ksize*number_of_input_features
    # output is thus of shape: bsize*outshp x nkern
    output = tensor.dot(patch_stack, kerns.T)

    # add bias across each feature map (more efficient to do it now)
    if bias is not None:
        output += bias

    # now to have feature maps in raster order ...
    # go from bsize*outshp x nkern to bsize x nkern*outshp
    newshp = tensor.stack(images.shape[0], \
            tensor.as_tensor(N.prod(outshp)), \
            tensor.as_tensor(nkern))
    tensout = tensor.reshape(output, newshp, ndim=3)
    # swap the outshp and nkern axes so feature maps are contiguous
    output = tensor.DimShuffle((False,) * tensout.ndim, (0, 2, 1))(tensout)
    if flatten:
        output = tensor.flatten(output, 2)

    return output, N.hstack((nkern, outshp))
def max_pool(images, imgshp, maxpoolshp):
    """Implements a max pooling layer

    Takes as input a 2D tensor of shape batch_size x img_size and performs
    max pooling. Max pooling downsamples by taking the max value in a given
    area, here defined by maxpoolshp. Outputs a 2D tensor of shape
    batch_size x output_size.

    @param images: 2D tensor containing images on which to apply convolution.
                   Assumed to be of shape batch_size x img_size
    @param imgshp: tuple containing image dimensions
    @param maxpoolshp: tuple containing shape of area to max pool over
    @output out1: symbolic result (2D tensor)
    @output out2: logical shape of the output
    """
    N = numpy
    poolsize = N.int64(N.prod(maxpoolshp))

    # imgshp contains either 2 entries (height,width) or 3 (nfeatures,h,w);
    # in the first case, default nfeatures to 1
    if N.size(imgshp) == 2:
        imgshp = (1,) + imgshp

    # construct indices and index pointers for sparse matrix, which, when
    # multiplied with input images will generate a stack of image patches;
    # pooling over disjoint windows == conv_eval with step == window shape
    indices, indptr, spmat_shape, sptype, outshp = \
        convolution_indices.conv_eval(imgshp, maxpoolshp, maxpoolshp,
                mode='valid')

    # BUGFIX: converted Python-2 `print` statements (a syntax error on
    # Python 3) to the print() function.
    # NOTE(review): this looks like leftover debug output; consider removing
    # it or routing it through `logging`.
    print('XXXXXXXXXXXXXXXX MAX POOLING LAYER XXXXXXXXXXXXXXXXXXXX')
    print('imgshp = ', imgshp)
    print('maxpoolshp = ', maxpoolshp)
    print('outshp = ', outshp)

    # build sparse matrix, then generate stack of image patches
    csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr,
            spmat_shape)
    patches = sparse.structured_dot(csc, images.T).T

    pshape = tensor.stack(images.shape[0] *
            tensor.as_tensor(N.prod(outshp)),
            tensor.as_tensor(imgshp[0]),
            tensor.as_tensor(poolsize))
    patch_stack = tensor.reshape(patches, pshape, ndim=3)

    # the max over each pooling window (last axis of the patch stack)
    out1 = tensor.max(patch_stack, axis=2)

    pshape = tensor.stack(images.shape[0],
            tensor.as_tensor(N.prod(outshp)),
            tensor.as_tensor(imgshp[0]))
    out2 = tensor.reshape(out1, pshape, ndim=3)

    # put feature maps in raster order: bsize x nfeatures x outsize
    out3 = tensor.DimShuffle((False,) * 3, (0, 2, 1))(out2)

    return tensor.flatten(out3, 2), outshp
theano/sparse/sandbox/test_sp.py
0 → 100644
浏览文件 @
7e84537f
import
sys
from
theano
import
function
,
Mode
from
theano.gof
import
OpWiseCLinker
import
theano
,
numpy
import
theano.tensor
as
T
import
theano.sparse
import
scipy.sparse
from
scipy.signal
import
convolve2d
import
scipy.sparse
as
sparse
import
numpy
import
numpy
as
N
from
theano.sparse.sandbox
import
sp
import
unittest
import
time
class
TestSP
(
unittest
.
TestCase
):
    # NOTE(review): this test uses Python-2-only constructs throughout
    # (print statements, iterator .next()); it needs porting before it can
    # run on Python 3.
    def test_convolution(self):
        # Compare sp.convolve against scipy.signal.convolve2d on a fixed
        # integer image batch, for several strides and both conv modes.
        print '\n\n*************************************************'
        print ' TEST CONVOLUTION'
        print '*************************************************'

        # fixed parameters
        bsize = 10  # batch size
        imshp = (28, 28)
        kshp = (5, 5)
        nkern = 5
        ssizes = ((1, 1), (2, 2), (3, 3), (4, 4))
        convmodes = ('full', 'valid')

        # symbolic stuff
        bias = T.dvector()
        kerns = T.dmatrix()
        input = T.dmatrix()
        rng = N.random.RandomState(3423489)
        filters = rng.randn(nkern, N.prod(kshp))
        biasvals = rng.randn(nkern)

        for mode in ('FAST_COMPILE', 'FAST_RUN'):  #, profmode):
            ttot, ntot = 0, 0
            for conv_mode in convmodes:
                for ss in ssizes:
                    output, outshp = sp.convolve(kerns, kshp, nkern, input, \
                            imshp, ss, bias=bias, mode=conv_mode)
                    f = function([kerns, bias, input], output, mode=mode)

                    # now test with real values
                    img2d = N.arange(bsize * N.prod(imshp)).reshape((bsize,)
                            + imshp)
                    img1d = img2d.reshape(bsize, -1)

                    # create filters (need to be flipped to use convolve2d)
                    filtersflipped = N.zeros((nkern,) + kshp)
                    for k in range(nkern):
                        it = reversed(filters[k, :])
                        for i in range(kshp[0]):
                            for j in range(kshp[1]):
                                # .next() is Python-2-only (next(it) on py3)
                                filtersflipped[k, i, j] = it.next()

                    # compute output with convolve2d
                    fulloutshp = N.array(imshp) - N.array(kshp) + 1 \
                            if conv_mode == 'valid' \
                            else N.array(imshp) + N.array(kshp) - 1
                    ntime1 = time.time()
                    refout = N.zeros((bsize,) + tuple(fulloutshp) + (nkern,))
                    for b in range(bsize):
                        for n in range(nkern):
                            refout[b, ..., n] = convolve2d(\
                                    img2d[b, :, :], filtersflipped[n, ...],
                                    conv_mode)
                    ntot += time.time() - ntime1

                    # need to flatten images; subsample the full reference
                    # output to emulate the stride ss
                    bench1 = refout[:, 0::ss[0], 0::ss[1], :].reshape(bsize,
                            -1, nkern)
                    bench1 += biasvals.reshape(1, 1, nkern)

                    # swap the last two dimensions (output needs to be
                    # nkern x outshp)
                    bench1 = N.swapaxes(bench1, 1, 2)

                    ttime1 = time.time()
                    out1 = f(filters, biasvals, img1d)
                    ttot += time.time() - ttime1

                    temp = bench1.flatten() - out1.flatten()
                    # NOTE(review): one-sided bound -- should arguably be
                    # abs(temp) < 1e-5
                    assert (temp < 1e-5).all()

                    # test downward propagation -- symbolic stuff
                    #vis = T.grad(output, input, output)
                    #downprop = function([kerns,input], vis, mode=mode)
                    #visval = downprop(filters,img1d)
                    ## test downward propagation -- reference implementation
                    #pshape = (img1d.shape[0],N.prod(outshp[1:]),N.prod(kshp))
                    #patchstack = N.zeros(pshape)
                    #for bi in N.arange(pshape[0]): # batch index
                        #abspos = 0
                        #for outy in N.arange(outshp[1]):
                            #for outx in N.arange(outshp[2]):
                                #for ni in N.arange(nkern):
                                    #print 'filters[n,:].shape = ', filters[n,:].shape
                                    #print 'out1[bi,abspos].shape =',out1[bi,abspos].shape
                                    #patchstack[bi,abspos,:] = filters[n,:]*out1[bi,abspos]
                                    #abspos+=1
                    #patchstack = patchstack.reshape(1,-1)
                    #indices, indptr, spmat_shape, sptype, outshp = \
                            #sp.convolution_indices.conv_eval(imshp,kshp,ss,conv_mode)
                    #spmat = sparse.csc_matrix((N.ones_like(indices),indices,indptr),spmat_shape)
                    #visref = N.dot(patchstack, spmat.todense())
                    #print 'visval = ', visval
                    #print 'visref = ', visref
                    #assert N.all(visref==visval)

            print '**** Convolution Profiling Results (', mode, ') ****'
            print 'Numpy processing time: ', ntot
            print 'Theano processing time: ', ttot
        #profmode.print_summary()
    # NOTE(review): Python-2-only constructs (print statements, xrange);
    # needs porting before it can run on Python 3.
    def test_sparse(self):
        # Compare sp.applySparseFilter against a hand-rolled zero-padded
        # patch/dot-product reference, then check the gradient against a
        # direct sparse-matrix product.
        print '\n\n*************************************************'
        print ' TEST SPARSE'
        print '*************************************************'

        # fixed parameters
        bsize = 10  # batch size
        imshp = (28, 28)
        kshp = (5, 5)
        nkern = 1  # per output pixel
        ssizes = ((1, 1), (2, 2))
        convmodes = ('full', 'valid',)

        # symbolic stuff
        bias = T.dvector()
        kerns = T.dvector()
        input = T.dmatrix()
        rng = N.random.RandomState(3423489)

        import theano.gof as gof

        #Mode(optimizer='fast_run', linker=gof.OpWiseCLinker(allow_gc=False)),):
        for mode in ('FAST_COMPILE', 'FAST_RUN'):  #,profmode):
            ntot, ttot = 0, 0
            for conv_mode in convmodes:
                for ss in ssizes:
                    output, outshp = sp.applySparseFilter(kerns, kshp, \
                            nkern, input, imshp, ss, bias=bias,
                            mode=conv_mode)
                    f = function([kerns, bias, input], output, mode=mode)

                    # build actual input images
                    img2d = N.arange(bsize * N.prod(imshp)).reshape((bsize,)
                            + imshp)
                    img1d = img2d.reshape(bsize, -1)
                    # pad with a (kshp-1)-wide zero border on every side so
                    # 'full' mode windows can be sliced directly
                    zeropad_img = N.zeros((bsize, \
                            img2d.shape[1] + 2 * (kshp[0] - 1), \
                            img2d.shape[2] + 2 * (kshp[1] - 1)))
                    zeropad_img[:,
                            kshp[0] - 1:kshp[0] - 1 + img2d.shape[1],
                            kshp[1] - 1:kshp[1] - 1 + img2d.shape[2]] = img2d

                    # build kernel matrix -- flatten it for theano stuff
                    filters = N.arange(N.prod(outshp) * N.prod(kshp)).\
                            reshape(nkern, N.prod(outshp[1:]), N.prod(kshp))
                    spfilt = filters.flatten()
                    biasvals = N.arange(N.prod(outshp))

                    # compute output by hand
                    ntime1 = time.time()
                    refout = N.zeros((bsize, nkern, outshp[1], outshp[2]))
                    patch = N.zeros((kshp[0], kshp[1]))
                    for b in xrange(bsize):
                        for k in xrange(nkern):
                            pixi = 0  # pixel index in raster order
                            for j in xrange(outshp[1]):
                                for i in xrange(outshp[2]):
                                    n = j * ss[0]
                                    m = i * ss[1]
                                    patch = zeropad_img[b, n:n + kshp[0],
                                            m:m + kshp[1]]
                                    # each output pixel has its own filter
                                    # row (no weight sharing)
                                    refout[b, k, j, i] = N.dot(
                                            filters[k, pixi, :], \
                                            patch.flatten())
                                    pixi += 1
                    refout = refout.reshape(bsize, -1) + biasvals
                    ntot += time.time() - ntime1

                    # need to flatten images
                    ttime1 = time.time()
                    out1 = f(spfilt, biasvals, img1d)
                    ttot += time.time() - ttime1

                    temp = refout - out1
                    # NOTE(review): one-sided bound -- should arguably be
                    # abs(temp) < 1e-10
                    assert (temp < 1e-10).all()

                    # test downward propagation
                    vis = T.grad(output, input, output)
                    downprop = function([kerns, output], vis)
                    temp1 = time.time()
                    for zz in range(100):
                        visval = downprop(spfilt, out1)
                    indices, indptr, spmat_shape, sptype, outshp, kmap = \
                        sp.convolution_indices.sparse_eval(imshp, kshp,
                                nkern, ss, conv_mode)
                    spmat = sparse.csc_matrix((spfilt[kmap], indices,
                            indptr), spmat_shape)
                    visref = N.dot(out1, spmat.todense())
                    assert N.all(visref == visval)

            print '**** Sparse Profiling Results (', mode, ') ****'
            print 'Numpy processing time: ', ntot
            print 'Theano processing time: ', ttot
        #profmode.print_summary()
    def test_multilayer_sparse(self):
        # Smoke test: chain two applySparseFilter layers and propagate
        # values through both; no numeric comparison is made (see note at
        # the end).
        # fixed parameters
        bsize = 10  # batch size
        imshp = (5, 5)
        kshp = ((3, 3), (2, 2))
        nkerns = (10, 20)  # per output pixel
        ssizes = ((1, 1), (2, 2))
        convmodes = ('full', 'valid',)

        # symbolic stuff
        kerns = [T.dvector(), T.dvector()]
        input = T.dmatrix()
        rng = N.random.RandomState(3423489)

        # build actual input images
        img2d = N.arange(bsize * N.prod(imshp)).reshape((bsize,) + imshp)
        img1d = img2d.reshape(bsize, -1)

        for mode in ('FAST_COMPILE', 'FAST_RUN'):
            for conv_mode in convmodes:
                for ss in ssizes:
                    # layer 2 consumes layer 1's output and logical shape
                    l1hid, l1outshp = sp.applySparseFilter(kerns[0],
                            kshp[0], \
                            nkerns[0], input, imshp, ss, mode=conv_mode)
                    l2hid, l2outshp = sp.applySparseFilter(kerns[1],
                            kshp[1], \
                            nkerns[1], l1hid, l1outshp, ss, mode=conv_mode)

                    l1propup = function([kerns[0], input], l1hid, mode=mode)
                    l2propup = function([kerns[1], l1hid], l2hid, mode=mode)

                    # actual values
                    l1kernvals = N.arange(N.prod(l1outshp) *
                            N.prod(kshp[0]))
                    l2kernvals = N.arange(N.prod(l2outshp) *
                            N.prod(kshp[1]) * nkerns[0])
                    l1hidval = l1propup(l1kernvals, img1d)
                    l2hidval = l2propup(l2kernvals, l1hidval)

        # this doesn't compare the output of anything... but I manually
        # verified that the patches are properly generated
    def test_multilayer_conv(self):
        # Smoke test: chain two sp.convolve layers (no numeric comparison
        # is made -- values only flow through both compiled functions).
        # fixed parameters
        bsize = 10  # batch size
        imshp = (5, 5)
        kshp = ((3, 3), (2, 2))
        nkerns = (3, 6)  # per output pixel
        # one stride configuration: (layer1 stride, layer2 stride)
        ssizes = (((1, 1), (2, 2)),)
        convmodes = ('full',)  #'valid',)

        # symbolic stuff
        kerns = [T.dmatrix(), T.dmatrix()]
        input = T.dmatrix()
        rng = N.random.RandomState(3423489)

        # build actual input images
        img2d = N.arange(bsize * N.prod(imshp)).reshape((bsize,) + imshp)
        img1d = img2d.reshape(bsize, -1)

        for mode in ('FAST_COMPILE', 'FAST_RUN'):
            for conv_mode in convmodes:
                for ss in ssizes:
                    l1hid, l1shp = sp.convolve(kerns[0], kshp[0], \
                            nkerns[0], input, imshp, ss[0], mode=conv_mode)
                    l1propup = function([kerns[0], input], l1hid, mode=mode)

                    #l1kernvals = N.random.rand(nkerns[0],N.prod(kshp[0]))
                    l1kernvals = N.arange(nkerns[0] *
                            N.prod(kshp[0])).reshape(nkerns[0],
                            N.prod(kshp[0]))
                    l1hidval = l1propup(l1kernvals, img1d)

                    # actual values; layer 2 consumes layer 1's output and
                    # logical shape
                    l2hid, l2shp = sp.convolve(kerns[1], kshp[1], \
                            nkerns[1], l1hid, l1shp, ss[1], mode=conv_mode)
                    l2propup = function([kerns[1], l1hid], l2hid, mode=mode)

                    #l2kernvals = N.random.rand(nkerns[1],N.prod(kshp[1])*nkerns[0])
                    l2kernvals = N.arange(nkerns[1] * N.prod(kshp[1]) *
                            nkerns[0]).reshape(nkerns[1],
                            N.prod(kshp[1]) * nkerns[0])
                    # for debugging, we bring things back to integers
                    l1hidval = N.arange(N.size(l1hidval)).reshape(
                            l1hidval.shape)

                    l2hidval = l2propup(l2kernvals, l1hidval)
def test_maxpool(self):
    """Compare sp.max_pool against a brute-force numpy max over each
    non-overlapping patch, for several pool shapes, then check its
    gradient with verify_grad.
    """
    # generate flattened images
    maxpoolshps = ((2, 2), (3, 3), (4, 4), (5, 5), (6, 6))
    imval = N.random.rand(4, 5, 10, 10)
    images = T.dmatrix()

    for maxpoolshp in maxpoolshps:
        # symbolic stuff
        output, outshp = sp.max_pool(images, imval.shape[1:], maxpoolshp)
        f = function([images, ], [output, ])
        output_val = f(imval.reshape(imval.shape[0], -1))

        # numeric verification.  Use explicit floor division (`//`) so the
        # shape arithmetic stays integral under Python 3 or
        # `from __future__ import division` (`/` silently relied on
        # Python 2 integer division).  Hoisted out of the loops below.
        rows = imval.shape[2] // maxpoolshp[0]
        cols = imval.shape[3] // maxpoolshp[1]
        my_output_val = N.zeros((imval.shape[0], imval.shape[1],
                                 rows, cols))
        assert N.prod(my_output_val.shape[1:]) == \
                N.prod(N.r_[imval.shape[1], outshp])

        for n in range(imval.shape[0]):
            for k in range(imval.shape[1]):
                for i in range(rows):
                    for j in range(cols):
                        ii, jj = i * maxpoolshp[0], j * maxpoolshp[1]
                        patch = imval[n, k, ii:ii + maxpoolshp[0],
                                      jj:jj + maxpoolshp[1]]
                        my_output_val[n, k, i, j] = N.max(patch)
        my_output_val = my_output_val.reshape(imval.shape[0], -1)
        assert N.all(output_val == my_output_val)

        def mp(input):
            output, outshp = sp.max_pool(input, imval.shape[1:], maxpoolshp)
            return output
        T.verify_grad(None, mp, [imval.reshape(imval.shape[0], -1)])
def test_CSMGrad(self):
    """Check the gradient of the sparse CSM constructor via verify_grad.

    `d` builds a dense matrix from a CSM-constructed sparse matrix; its
    gradient with respect to the data vector `kerns` is what verify_grad
    exercises, for each compile mode / subsampling size combination.
    """
    imshp = (3, 3)
    nkern = 1  # per output pixel
    kshp = (2, 2)
    #ssizes = ((1,1),(2,2))
    ssizes = ((1, 1),)
    #convmodes = ('full','valid',)
    convmodes = ('full',)

    # symbolic inputs: CSM takes (data, indices, indptr, shape)
    kerns = T.dvector()
    indices = T.ivector()
    indptr = T.ivector()
    spmat_shape = T.ivector()

    # NOTE: `sptype` and `kmap` are free variables of `d`; they are only
    # bound inside the loop below (by sparse_eval) *before* verify_grad
    # first calls `d`, so the closure resolves them at call time.  Do not
    # reorder these statements.
    def d(kerns, indices, indptr, spmat_shape):
        return theano.sparse.dense_from_sparse(
                theano.sparse.CSM(sptype, kmap)(
                    kerns, indices, indptr, spmat_shape))

    for mode in ['FAST_COMPILE', 'FAST_RUN']:
        for conv_mode in convmodes:
            for ss in ssizes:
                # concrete CSC/CSR structure for this configuration
                indvals, indptrvals, spshapevals, sptype, outshp, kmap = \
                        sp.convolution_indices.sparse_eval(
                            imshp, kshp, nkern, ss, conv_mode)
                kvals = N.random.random(
                        nkern * N.prod(kshp) * N.prod(outshp)).flatten()
                # symbolic stuff
                T.verify_grad(None, d,
                        [kvals, indvals, indptrvals, spshapevals])
def test_diagonal():
    """sp.square_diagonal(v) must place vector v on the diagonal of a
    KxK sparse matrix; compare against a hand-built dense reference."""
    for K in 1, 5:
        vec = T.ivector()
        f = theano.function([vec], sp.square_diagonal(vec))

        # dense reference: expected[i, i] = i, zeros elsewhere
        expected = N.zeros((K, K), dtype='int32')
        for i in range(K):
            expected[i, i] = i

        assert N.all(expected == f(range(K)).toarray())
def test_diagonal_grad():
    """Gradient of sp_sum(square_diagonal(x)) w.r.t. x, sparse-grad path."""
    def summed_diag(x):
        diag = sp.square_diagonal(x)
        return sp.sp_sum(diag, sparse_grad=True)

    py_mode = theano.Mode(linker='py', optimizer='fast_compile')
    T.verify_grad(None, summed_diag, [[0.0, 0.1, 0.2, 0.3]], mode=py_mode)
def
test_row_scale
():
x
=
theano
.
sparse
.
csc_matrix
()
s
=
theano
.
tensor
.
dvector
()
def
d
(
x
,
s
):
return
sp
.
sp_sum
(
sp
.
row_scale
(
x
,
s
),
sparse_grad
=
True
)
rng
=
numpy
.
random
.
RandomState
(
8723
)
R
=
5
C
=
8
x_val_dense
=
numpy
.
zeros
((
R
,
C
),
dtype
=
'd'
)
for
idx
in
[(
0
,
0
),
(
4
,
1
),
(
2
,
1
),
(
3
,
3
),
(
4
,
4
),
(
3
,
7
),
(
2
,
7
)]:
x_val_dense
.
__setitem__
(
idx
,
rng
.
randn
())
x_val
=
scipy
.
sparse
.
csc_matrix
(
x_val_dense
)
s_val
=
rng
.
randn
(
R
)
f
=
theano
.
function
([
x
,
s
],
sp
.
row_scale
(
x
,
s
))
print
'A'
,
f
(
x_val
,
s_val
)
.
toarray
()
print
'B'
,
(
x_val_dense
.
T
*
s_val
)
.
T
assert
numpy
.
all
(
f
(
x_val
,
s_val
)
.
toarray
()
==
(
x_val_dense
.
T
*
s_val
)
.
T
)
if
0
:
T
.
verify_grad
(
None
,
d
,
[
x_val
,
s_val
],
mode
=
theano
.
Mode
(
linker
=
'py'
,
optimizer
=
'fast_compile'
))
else
:
print
>>
sys
.
stderr
,
"WARNING: skipping gradient test because verify_grad doesn't support sparse arguments"
def
test_col_scale
():
x
=
theano
.
sparse
.
csc_matrix
()
s
=
theano
.
tensor
.
dvector
()
def
d
(
x
,
s
):
return
sp
.
sp_sum
(
sp
.
col_scale
(
x
,
s
),
sparse_grad
=
True
)
rng
=
numpy
.
random
.
RandomState
(
8723
)
R
=
5
C
=
8
x_val_dense
=
numpy
.
zeros
((
R
,
C
),
dtype
=
'd'
)
for
idx
in
[(
0
,
0
),
(
4
,
1
),
(
2
,
1
),
(
3
,
3
),
(
4
,
4
),
(
3
,
7
),
(
2
,
7
)]:
x_val_dense
.
__setitem__
(
idx
,
rng
.
randn
())
x_val
=
scipy
.
sparse
.
csc_matrix
(
x_val_dense
)
s_val
=
rng
.
randn
(
C
)
f
=
theano
.
function
([
x
,
s
],
sp
.
col_scale
(
x
,
s
))
print
'A'
,
f
(
x_val
,
s_val
)
.
toarray
()
print
'B'
,
(
x_val_dense
*
s_val
)
assert
numpy
.
all
(
f
(
x_val
,
s_val
)
.
toarray
()
==
(
x_val_dense
*
s_val
))
if
0
:
T
.
verify_grad
(
None
,
d
,
[
x_val
,
s_val
],
mode
=
theano
.
Mode
(
linker
=
'py'
,
optimizer
=
'fast_compile'
))
else
:
print
>>
sys
.
stderr
,
"WARNING: skipping gradient test because verify_grad doesn't support sparse arguments"
# Allow running this test file directly as a script.
if __name__ == '__main__':
    if 0:
        # Disabled path: run only the TestSP case with a verbose runner.
        # Flip the `if 0` above to use it instead of unittest.main().
        testcase = TestSP
        suite = unittest.TestLoader()
        suite = suite.loadTestsFromTestCase(testcase)
        unittest.TextTestRunner(verbosity=2).run(suite)
    else:
        # Default: let unittest discover and run everything in this module.
        unittest.main()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论