Merge pull request #1580 from nouiz/deprecated

[MRG]Deprecated

Merge pull request #1580 from nouiz/deprecated
9950ce08 · Pascal Lamblin · 1d639d66 · 7c42bebe · 9950ce08 · 9950ce08
--- a/MANIFEST.in
+++ b/MANIFEST.in
 global-include *.txt
+global-include *.c
 global-include *.cu
 global-include *.cuh
 global-include *.sh

--- a/doc/extending/fibby.txt
+++ b/doc/extending/fibby.txt
@@ -67,9 +67,9 @@ you should check the strides and alignment.
        if (!%(y)s)
            %(fail)s;
        {//New scope needed to make compilation work
-            dtype_%(y)s * y = (dtype_%(y)s*)%(y)s->data;
+            dtype_%(y)s * y = (dtype_%(y)s*)PyArray_DATA(%(y)s);
-            dtype_%(x)s * x = (dtype_%(x)s*)%(x)s->data;
+            dtype_%(x)s * x = (dtype_%(x)s*)PyArray_DATA(%(x)s);
-            for (int i = 2; i < %(x)s->dimensions[0]; ++i)
+            for (int i = 2; i < PyArray_DIMS(%(x)s)[0]; ++i)
                y[i] = y[i-1]*y[i-2] + x[i];
        }
      """ % locals()

--- a/doc/library/tensor/basic.txt
+++ b/doc/library/tensor/basic.txt
@@ -420,7 +420,9 @@ TensorVariable
 .. class:: _tensor_py_operators(object)
-    This mix-in class adds convenient attributes, methods, and support for Python operators (see :ref:`tensor_operator_support`).
+    This mix-in class adds convenient attributes, methods, and support
+    to TensorVariable, TensorConstant and TensorSharedVariable for
+    Python operators (see :ref:`tensor_operator_support`).
    .. attribute:: type
@@ -472,6 +474,10 @@ TensorVariable
        See :func:`flatten`.
+    .. method:: ravel()
+        Return ``self.flatten()``. Provided for NumPy compatibility.
    .. attribute:: T
        Transpose of this tensor.
@@ -485,8 +491,31 @@ TensorVariable
            same vector!  Use `reshape` or `dimshuffle` to turn your vector
            into a row or column matrix.
+    .. method:: {any,all}(axis=None, keepdims=False)
+    .. method:: {sum,prod,mean}(axis=None, dtype=None, keepdims=False, acc_dtype=None)
+    .. method:: {var,std,min,max,argmin,argmax}(axis=None, keepdims=False)
+    .. method:: diagonal(offset=0, axis1=0, axis2=1)
+    .. method:: astype(dtype)
+    .. method:: take(indices, axis=None, mode='raise')
+    .. method:: copy()
+    .. method:: norm(L, axis=None)
+    .. method:: nonzero(return_matrix=False)
+    .. method:: nonzero_values()
+    .. method:: sort(axis=-1, kind='quicksort', order=None)
+    .. method:: argsort(axis=-1, kind='quicksort', order=None)
+    .. method:: clip(a_min, a_max)
+    .. method:: conj()
+    .. method:: repeat(repeats, axis=None)
+    .. method:: round(mode="half_away_from_zero")
+    .. method:: trace()
+    .. method:: get_scalar_constant_value()
+    .. method:: zeros_like(model, dtype=None)
+       All the above methods are the Theano equivalents of the NumPy methods of the same name, applied to the current tensor.
+    .. method:: __{abs,neg,lt,le,gt,ge,invert,and,or,add,sub,mul,div,truediv,floordiv}__
+       These elementwise operations are supported via Python syntax.
 Shaping and Shuffling
 =====================

--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
@@ -155,11 +155,11 @@ class WeirdBrokenOp(gof.Op):
        prep_vars = """
            //the output array has size M x N
            npy_intp M = PyArray_DIMS(%(a)s)[0];
-            npy_intp Sa = %(a)s->strides[0] / PyArray_DESCR(%(a)s)->elsize;
+            npy_intp Sa = PyArray_STRIDES(%(a)s)[0] / PyArray_DESCR(%(a)s)->elsize;
-            npy_intp Sz = %(z)s->strides[0] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Sz = PyArray_STRIDES(%(z)s)[0] / PyArray_DESCR(%(z)s)->elsize;
-            npy_double * Da = (npy_double*)%(a)s->data;
+            npy_double * Da = (npy_double*)PyArray_BYTES(%(a)s);
-            npy_double * Dz = (npy_double*)%(z)s->data;
+            npy_double * Dz = (npy_double*)PyArray_BYTES(%(z)s);
            //clear the output array
            for (npy_intp m = 0; m < M; ++m)

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -1693,7 +1693,7 @@ class GCC_compiler(object):
        #to use the new API, but not everywhere. When finished, enable
        #the following macro to assert that we don't bring new code
        #that use the old API.
-        #cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
+        cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
        numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
        # numpy 1.7 deprecated the following macro but the new one didn't

--- a/theano/gof/lazylinker_c.c.txt
+++ b/theano/gof/lazylinker_c.c.txt
--- a/theano/gof/lazylinker_c.py
+++ b/theano/gof/lazylinker_c.py
@@ -76,10 +76,7 @@ except ImportError:
        except ImportError:
            _logger.info("Compiling new CVM")
            dirname = 'lazylinker_ext'
-            # We use a .txt extensions as otherwise it don't get
+            cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
-            # included when we create a package to send to pypi
-            # This happen even if we tell to include *.c files
-            cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c.txt')
            code = open(cfile).read()
            loc = os.path.join(config.compiledir, dirname)
            if not os.path.exists(loc):

--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -220,6 +220,7 @@ if __name__ == "__main__":
        GTX 650 Ti               0.27s
        GTX 460                  0.37s                0.45s
        GTX 285           0.42s         0.452s        0.452s        0.40s # cuda 3.0 seems faster? driver version?
+        750M                     0.49s
        GTX 550 Ti                                    0.57s
        GT 520                          2.68s                3.06s
        520M                     2.44s                       3.19s        # with bumblebee on Ubuntu 12.04

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2223,12 +2223,6 @@ class GpuReshape(tensor.Reshape, GpuOp):
        out[0] = x.reshape(tuple(shp))
-# C Code shared by GpuSubtensor and GpuIncSubtensor
-_define_set_data = """
-    #define CudaNdarray_set_device_data2(obj, ptr, base) \
-            CudaNdarray_set_device_data(obj, (float *)ptr, base)
-"""
 class GpuSubtensor(GpuOp, tensor.Subtensor):
    """
    Implement subtensor on the gpu.
@@ -2276,16 +2270,27 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
        view_ndim = node.outputs[0].ndim
        fail = sub['fail']
+        decl = "CudaNdarray* xview = NULL;"
+        get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
+                                       self.idx_list,
+                                       view_ndim=view_ndim,
+                                       c_prefix='CudaNdarray',
+                                       strides_mul=4,
+                                       )
        build_view = """
        //TODO: give this Op a second output so that this view can be cached
        //TODO: alternatively, fix the memory leak on failure
-        CudaNdarray* xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
+        xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
        if (!xview)
        {
            %(fail)s;
        }
-        if (CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(%(x)s),
-                                       (PyObject*) NULL))
+        if (CudaNdarray_set_device_data(
+                xview,
+                CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
+                (PyObject*) %(x)s))
        {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuSubtensor is not able to set the"
@@ -2294,43 +2299,24 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
            %(fail)s;
        }
        cnda_mark_dev_structure_dirty(xview);
-        """ % locals()
+        for(int idx=0;idx <%(view_ndim)s; idx++){
+        //For broadcasted dimensions, set the strides to 0
-        get_xview = _define_set_data + \
+        //We can't do that only for broadcasted dimensions as this can happen
-                    self.helper_c_code(node, name, inputs, outputs, sub,
+        //for dimensions of size 0. That are rebroadcated later.
-                                       self.idx_list,
+            if(xview_dims[idx]==1)
-                                       c_prefix='CudaNdarray',
+                CudaNdarray_set_stride(xview, idx, 0);
-                                       set_data='CudaNdarray_set_device_data2',
+            else
-                                       set_dim='CudaNdarray_set_dim',
+                CudaNdarray_set_stride(xview, idx, xview_strides[idx]);
-                                       set_stride='CudaNdarray_set_stride',
+            CudaNdarray_set_dim(xview, idx, xview_dims[idx]);
-                                       update_flags="", strides_mul=4)
-        finish_view = ""
-        #For broadcasted dimensions, set the strides to 0
-        #We can't do that only for broadcasted dimensions as this can happen for dimensions of size 0,
-        #That are rebroadcated later.
-        for idx in range(node.outputs[0].ndim):
-            finish_view += """
-            if(CudaNdarray_HOST_DIMS(xview)[%(idx)s]==1)
-            CudaNdarray_set_stride(xview, %(idx)s, 0);
-            """ % locals()
-        finish_view += """
-        //Set the base only now
-        if(CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(xview),
-                                    %(x)s)){
-            PyErr_Format(PyExc_RuntimeError,
-                         "GpuSubtensor is not able to set"
-                         " the base of the view array");
-            Py_XDECREF(xview);
-            %(fail)s;
        }
+        """ % locals()
+        finish_view = """
        Py_XDECREF(%(z)s);
        %(z)s = xview;
        """ % locals()
-        return build_view + "{" + get_xview + "}" + finish_view
+        return decl + get_xview + build_view + finish_view
    def c_code_cache_version(self):
        hv = self.helper_c_code_cache_version()
@@ -2719,6 +2705,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        """ %locals()
 class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    """
    Implement IncSubtensor on the gpu.
@@ -2756,6 +2743,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
        """
        return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()
+    def decl_view(self):
+        return "CudaNdarray* zview = NULL;"
    def make_view_array(self, x, view_ndim):
        """
            :param x: a string identifying an array to be viewed
@@ -2765,17 +2755,32 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
            This doesn't need to actually set up the view with the
            right indexing; we'll do that manually later.
        """
-        return """CudaNdarray* zview = (CudaNdarray*)
+        ret = """zview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
-                CudaNdarray_New(%(view_ndim)s)""" % locals()
+        if (CudaNdarray_set_device_data(
+                zview,
+                CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
+                (PyObject*) %(x)s))
+        {
+            zview = NULL;
+            PyErr_Format(PyExc_RuntimeError,
+                         "GpuSubtensor is not able to set the"
+                         " devdata field of the view");
+        }else{
+            cnda_mark_dev_structure_dirty(zview);
+            for(int idx=0;idx <%(view_ndim)s; idx++){
+                if(xview_dims[idx]==1)
+                    CudaNdarray_set_stride(zview, idx, 0);
+                else
+                    CudaNdarray_set_stride(zview, idx, xview_strides[idx]);
+                CudaNdarray_set_dim(zview, idx, xview_dims[idx]);
+            }
+        }
+        """ % locals()
+        return ret
    def get_helper_c_code_args(self):
        """ Return a dictionary of arguments to use with helper_c_code"""
-        return { 'update_flags' : "",
+        return {'c_prefix': 'CudaNdarray',
-                'c_prefix' : 'CudaNdarray',
-                'set_data' :'CudaNdarray_set_device_data2',
-                'set_dim' : 'CudaNdarray_set_dim',
-                'set_stride' : 'CudaNdarray_set_stride',
-                'update_flags' : "",
                'strides_mul': 4
                }
@@ -2789,24 +2794,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
        """
        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
-    def define_set_data(self):
-        return _define_set_data
-    def link_view_array(self, x, fail):
-        return """
-        if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
-                                       (PyObject*) NULL))
-        {
-            PyErr_Format(PyExc_RuntimeError,
-                         "GpuSubtensor is not able to set the"
-                         " devdata field of the view");
-            Py_XDECREF(zview);
-            %(fail)s;
-        }
-        cnda_mark_dev_structure_dirty(zview);
-        """ % locals()
    def set_view_base(self, x, fail):
        return """
        //Set the base only now
@@ -2823,7 +2810,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    def add_to_zview(self, x, fail):
        return """
        PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
                                                        (PyObject *) py_%(x)s);
@@ -2839,7 +2825,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
        """ % locals()
    def c_code_cache_version(self):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if parent_version:
            return parent_version + (0,)

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -5,13 +5,14 @@ Generator code in SSJ package (L'Ecuyer & Simard)
 http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
 """
-import sys, warnings
+import warnings
 import numpy
 from theano import Op, Apply, shared, config, Variable
 from theano.tensor import (raw_random, TensorType, as_tensor_variable,
                           get_vector_length, cast, opt, scal)
-from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod
+from theano.tensor import sqrt, log, sin, cos, join, prod
 from theano.compile import optdb
 from theano.gof import local_optimizer
 from theano.gof.python25 import all, any
@@ -36,6 +37,7 @@ def matVecModM(A, s, m):
                x[i] = r + m
    return x
 def multMatVect(v, A, m1, B, m2):
    #multiply the first half of v by A with a modulo of m1
    #and the second half by B with a modulo of m2
@@ -79,9 +81,11 @@ A2p134 = numpy.asarray(
     [1401213391, 1178684362, 1431130166]])
 np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
 def ff_2p134(rstate):
    return multMatVect(rstate, A1p134, M1, A2p134, M2)
 def ff_2p72(rstate):
    return multMatVect(rstate, A1p72, M1, A2p72, M2)
@@ -93,8 +97,8 @@ def mrg_next_value(rstate, new_rstate):
    #i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
    i0, i7, i9, i15, i16, i22, i24 = np_int32_vals
    #first component
-    y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
+    y1 = (((x12 & MASK12) << i22) + (x12 >> i9) +
-        + ((x13 & MASK13) << i7) + (x13 >> i24))
+          ((x13 & MASK13) << i7) + (x13 >> i24))
    assert type(y1) == numpy.int32
    if (y1 < 0 or y1 >= M1):     #must also check overflow
@@ -135,6 +139,7 @@ def mrg_next_value(rstate, new_rstate):
    else:
        return (x11 - x21) * NORM
 class mrg_uniform_base(Op):
    def __init__(self, output_type, inplace=False):
        Op.__init__(self)
@@ -145,17 +150,19 @@ class mrg_uniform_base(Op):
        self.warned_numpy_version = False
    def __eq__(self, other):
-        return type(self) == type(other) \
+        return (type(self) == type(other) and
-                and self.output_type == other.output_type \
+                self.output_type == other.output_type and
-                and self.inplace == other.inplace
+                self.inplace == other.inplace)
    def __hash__(self):
        return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
    def __str__(self):
        if self.inplace:
            s = "inplace"
-        else: s = "no_inplace"
+        else:
-        return self.__class__.__name__+"{%s,%s}"%(self.output_type,s)
+            s = "no_inplace"
+        return self.__class__.__name__ + "{%s,%s}" % (self.output_type, s)
    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
@@ -166,7 +173,7 @@ class mrg_uniform_base(Op):
                     [rstate, size],
                     [rstate.type(), self.output_type()])
-    def grad(self,inputs,ograd):
+    def grad(self, inputs, ograd):
        return [None for i in inputs]
    def R_op(self, inputs, eval_points):
@@ -187,8 +194,8 @@ class mrg_uniform(mrg_uniform_base):
    def perform(self, node, inp, out):
        rstate, size = inp
        o_rstate, o_sample = out
-        numpy_version=numpy.__version__.split('.')
+        numpy_version = numpy.__version__.split('.')
-        if not self.warned_numpy_version and int(numpy_version[0])<=1 and int(numpy_version[1])<3:
+        if not self.warned_numpy_version and int(numpy_version[0]) <= 1 and int(numpy_version[1]) <3 :
            print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
            self.warned_numpy_version = True
@@ -201,20 +208,21 @@ class mrg_uniform(mrg_uniform_base):
        for s in size:
            n_elements *= s
-        n_streams,_ = rstate.shape
+        n_streams, _ = rstate.shape
        rval = numpy.zeros(n_elements, dtype=self.output_type.dtype)
        err_orig = numpy.seterr(over='ignore')
        try:
            for i in xrange(n_elements):
-                sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams])
+                sample = mrg_next_value(rstate[i % n_streams],
+                                        rstate[i % n_streams])
                rval[i] = sample
        finally:
            numpy.seterr(**err_orig)
        o_rstate[0] = node.outputs[0].type.filter(rstate)  # send to GPU if necessary
-        o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary
+        o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))  # send to GPU if necessary
    def c_code(self, node, name, inp, out, sub):
        rstate, size = inp
@@ -228,7 +236,7 @@ class mrg_uniform(mrg_uniform_base):
        fail = sub['fail']
        if self.output_type.dtype == 'float32':
            otype = 'float'
-            NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
+            NORM = '4.6566126e-10f'  # numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        else:
@@ -279,7 +287,7 @@ class mrg_uniform(mrg_uniform_base):
        }
        for (int i = 0; i < %(ndim)s; ++i)
        {
-            odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
+            odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
            n_elements *= odims[i];
            must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
            //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
@@ -313,8 +321,8 @@ class mrg_uniform(mrg_uniform_base):
        }
        n_streams = PyArray_DIMS(%(o_rstate)s)[0];
-        sample_data = (%(otype)s *) %(o_sample)s->data;
+        sample_data = (%(otype)s *) PyArray_DATA(%(o_sample)s);
-        state_data = (npy_int32 *) %(o_rstate)s->data;
+        state_data = (npy_int32 *) PyArray_DATA(%(o_rstate)s);
        for (int i = 0; i < n_elements; ++i)
        {
            npy_int32 * state_data_i = state_data + (i%%n_streams)*6;
@@ -392,7 +400,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
    def c_support_code_apply(self, node, nodename):
        if self.output_type.dtype == 'float32':
            otype = 'float'
-            NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
+            NORM = '4.6566126e-10f'  # numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        else:
@@ -476,7 +484,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
            }
        }
-        """ %locals()
+        """ % locals()
    def c_code(self, node, nodename, inp, out, sub):
        rstate, size = inp
@@ -491,7 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        else:
            otype = 'double'
-        SYNC="CNDA_THREAD_SYNC";
+        SYNC = "CNDA_THREAD_SYNC"
        return """
        //////// <code generated by mrg_uniform>
@@ -521,7 +529,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        }
        for (int i = 0; i < %(ndim)s; ++i)
        {
-            odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
+            odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
@@ -593,7 +601,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        }
        //////// </ code generated by mrg_uniform>
-        """ %locals()
+        """ % locals()
    def c_code_cache_version(self):
        return (7,)
@@ -662,7 +671,7 @@ class MRG_RandomStreams(object):
            elif seed >= M2:
                raise ValueError('seed should be less than %i' % M2, seed)
            self.rstate = numpy.asarray([seed]*6, dtype='int32')
-        elif len(seed)==6:
+        elif len(seed) == 6:
            if seed[0] == 0 and seed[1] == 0 and seed[2] == 0:
                raise ValueError('The first 3 values of seed should not be all 0', seed)
            if seed[3] == 0 and seed[4] == 0 and seed[5] == 0:
@@ -690,7 +699,7 @@ class MRG_RandomStreams(object):
        """
        assert n_streams < 2**72
        assert n_streams > 0
-        rval = numpy.zeros((n_streams,6), dtype='int32')
+        rval = numpy.zeros((n_streams, 6), dtype='int32')
        rval[0] = self.rstate
        for i in xrange(1, n_streams):
            rval[i] = ff_2p72(rval[i - 1])
@@ -776,11 +785,13 @@ class MRG_RandomStreams(object):
            # currently no Theano node that will do a frombuffer
            # reinterpretation.
            u = self.pretty_return(node_rstate,
-                    *GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
+                                   *GPU_mrg_uniform.new(node_rstate,
+                                                        ndim, dtype, size))
        else:
            node_rstate = shared(self.get_substream_rstates(nstreams))
            u = self.pretty_return(node_rstate,
-                    *mrg_uniform.new(node_rstate, ndim, dtype, size))
+                                   *mrg_uniform.new(node_rstate,
+                                                    ndim, dtype, size))
        r = u * (high - low) + low
        if u.type.broadcastable != r.type.broadcastable:
@@ -934,4 +945,6 @@ def mrg_random_make_inplace(node):
        new_op = op.__class__(op.output_type, inplace=True)
        return new_op.make_node(*node.inputs).outputs
    return False
-optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
+optdb.register('random_make_inplace_mrg',
+               opt.in2out(mrg_random_make_inplace, ignore_newtrees=True),
+               99, 'fast_run', 'inplace')
--- a/theano/scan_module/scan_perform.c.txt
+++ b/theano/scan_module/scan_perform.c.txt
--- a/theano/scan_module/scan_perform.pyx
+++ b/theano/scan_module/scan_perform.pyx
@@ -62,7 +62,7 @@ import copy
 def get_version():
-    return 0.278
+    return 0.279
 @cython.boundscheck(False)
 def perform(

--- a/theano/scan_module/scan_perform_ext.py
+++ b/theano/scan_module/scan_perform_ext.py
@@ -11,7 +11,7 @@ _logger = logging.getLogger('theano.scan_module.scan_perform')
 _logger.setLevel(logging.WARN)
-version = 0.278  # must match constant returned in function get_version()
+version = 0.279  # must match constant returned in function get_version()
 need_reload = False
@@ -52,11 +52,8 @@ except ImportError:
            _logger.info("Compiling C code for scan")
            dirname = 'scan_perform'
-            # We use a .txt extensions as otherwise it don't get
-            # included when we create a package to send to pypi
-            # This happen even if we tell to include *.c files
            cfile = os.path.join(theano.__path__[0], 'scan_module',
-                                 'scan_perform.c.txt')
+                                 'scan_perform.c')
            code = open(cfile).read()
            loc = os.path.join(config.compiledir, dirname)
            if not os.path.exists(loc):

--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -1795,9 +1795,9 @@ class AddSD(gof.op.Op):
                }
                npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1;
-                const npy_int32 * __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+                const npy_int32 * __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-                const npy_int32 * __restrict__ indices = (npy_int32*)%(_indices)s->data;
+                const npy_int32 * __restrict__ indices = (npy_int32*)PyArray_DATA(%(_indices)s);
-                const dtype_%(_data)s* __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
+                const dtype_%(_data)s* __restrict__ data = (dtype_%(_data)s*)PyArray_DATA(%(_data)s);
                dtype_%(y)s* ydata = (dtype_%(y)s*)PyArray_DATA(%(y)s);
                dtype_%(z)s* zdata = (dtype_%(z)s*)PyArray_DATA(%(z)s);
@@ -2983,10 +2983,10 @@ class StructuredDotGradCSC(gof.Op):
        if (PyArray_NDIM(%(_indices)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1"); %(fail)s;}
        if (PyArray_NDIM(%(_indptr)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1"); %(fail)s;}
-        if( PyArray_DESCR(%(_indices)s)->type_num != NPY_INT32) {
+        if( PyArray_TYPE(%(_indices)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
-        if( PyArray_DESCR(%(_indptr)s)->type_num != NPY_INT32)
+        if( PyArray_TYPE(%(_indptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if( PyArray_DIMS(%(_d)s)[1] != PyArray_DIMS(%(_g)s)[1])
@@ -2996,29 +2996,29 @@ class StructuredDotGradCSC(gof.Op):
            || (PyArray_DIMS(%(_zout)s)[0] != PyArray_DIMS(%(_indices)s)[0]))
        {
            Py_XDECREF(%(_zout)s);
-            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, PyArray_DIMS(%(_indices)s), PyArray_DESCR(%(_g)s)->type_num);
+            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, PyArray_DIMS(%(_indices)s), PyArray_TYPE(%(_g)s));
        }
        {   //makes it compile even though labels jump over variable definitions.
            npy_intp nnz = PyArray_DIMS(%(_indices)s)[0];
            npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1; //TODO: error checking with this
-            npy_intp Sindices = %(_indices)s->strides[0]/PyArray_DESCR(%(_indices)s)->elsize;
+            npy_intp Sindices = PyArray_STRIDES(%(_indices)s)[0]/PyArray_DESCR(%(_indices)s)->elsize;
-            npy_intp Sindptr = %(_indptr)s->strides[0]/PyArray_DESCR(%(_indptr)s)->elsize;
+            npy_intp Sindptr = PyArray_STRIDES(%(_indptr)s)[0]/PyArray_DESCR(%(_indptr)s)->elsize;
-            const npy_intp Sd1 = %(_d)s->strides[1]/PyArray_DESCR(%(_d)s)->elsize;
+            const npy_intp Sd1 = PyArray_STRIDES(%(_d)s)[1]/PyArray_DESCR(%(_d)s)->elsize;
-            const npy_intp Sg1 = %(_g)s->strides[1]/PyArray_DESCR(%(_g)s)->elsize;
+            const npy_intp Sg1 = PyArray_STRIDES(%(_g)s)[1]/PyArray_DESCR(%(_g)s)->elsize;
            const npy_intp K = PyArray_DIMS(%(_d)s)[1];
-            const npy_int32 * __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+            const npy_int32 * __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-            const npy_int32 * __restrict__ indices = (npy_int32 *)%(_indices)s->data;
+            const npy_int32 * __restrict__ indices = (npy_int32 *)PyArray_DATA(%(_indices)s);
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // extract j-th row of dense matrix
-                const dtype_%(_d)s* __restrict__ d_row = (dtype_%(_d)s*)(%(_d)s->data + %(_d)s->strides[0] * j);
+                const dtype_%(_d)s* __restrict__ d_row = (dtype_%(_d)s*)(PyArray_BYTES(%(_d)s) + PyArray_STRIDES(%(_d)s)[0] * j);
                if(j >= PyArray_DIMS(%(_d)s)[0]) {PyErr_SetString(PyExc_NotImplementedError, "G"); %(fail)s;}
                // for each non-null value in the sparse column
@@ -3028,7 +3028,7 @@ class StructuredDotGradCSC(gof.Op):
                    npy_int32 i = indices[i_idx * Sindices];
                    // extract corresponding row in gradient
-                    const dtype_%(_g)s* __restrict__ g_row = (dtype_%(_g)s*)(%(_g)s->data + %(_g)s->strides[0] * i);
+                    const dtype_%(_g)s* __restrict__ g_row = (dtype_%(_g)s*)(PyArray_BYTES(%(_g)s) + PyArray_STRIDES(%(_g)s)[0] * i);
                    double ip = 0.0;
                    // make sure that row index is not bigger than actual number of rows
@@ -3044,7 +3044,7 @@ class StructuredDotGradCSC(gof.Op):
                    }
                    // write resulting gradient to sparse output
-                    ((dtype_%(_zout)s* __restrict__)(%(_zout)s->data + i_idx * %(_zout)s->strides[0]))[0] = ip;
+                    ((dtype_%(_zout)s* __restrict__)(PyArray_BYTES(%(_zout)s) + i_idx * PyArray_STRIDES(%(_zout)s)[0]))[0] = ip;
                }
            }
        }
@@ -3119,10 +3119,10 @@ class StructuredDotGradCSR(gof.Op):
        if (PyArray_NDIM(%(_indices)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(indices) != 1"); %(fail)s;}
        if (PyArray_NDIM(%(_indptr)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1"); %(fail)s;}
-        if( PyArray_DESCR(%(_indices)s)->type_num != NPY_INT32) {
+        if( PyArray_TYPE(%(_indices)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
-        if( PyArray_DESCR(%(_indptr)s)->type_num != NPY_INT32)
+        if( PyArray_TYPE(%(_indptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if( PyArray_DIMS(%(_d)s)[1] != PyArray_DIMS(%(_g)s)[1])
@@ -3132,7 +3132,7 @@ class StructuredDotGradCSR(gof.Op):
            || (PyArray_DIMS(%(_zout)s)[0] != PyArray_DIMS(%(_indices)s)[0]))
        {
            Py_XDECREF(%(_zout)s);
-            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, PyArray_DIMS(%(_indices)s), PyArray_DESCR(%(_g)s)->type_num);
+            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, PyArray_DIMS(%(_indices)s), PyArray_TYPE(%(_g)s));
        }
        {   //makes it compile even though labels jump over variable definitions.
@@ -3140,16 +3140,16 @@ class StructuredDotGradCSR(gof.Op):
            // extract number of rows
            npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1; //TODO: error checking with this
-            npy_intp Sindices = %(_indices)s->strides[0]/PyArray_DESCR(%(_indices)s)->elsize;
+            npy_intp Sindices = PyArray_STRIDES(%(_indices)s)[0]/PyArray_DESCR(%(_indices)s)->elsize;
-            npy_intp Sindptr = %(_indptr)s->strides[0]/PyArray_DESCR(%(_indptr)s)->elsize;
+            npy_intp Sindptr = PyArray_STRIDES(%(_indptr)s)[0]/PyArray_DESCR(%(_indptr)s)->elsize;
-            const npy_intp Sd1 = %(_d)s->strides[1]/PyArray_DESCR(%(_d)s)->elsize;
+            const npy_intp Sd1 = PyArray_STRIDES(%(_d)s)[1]/PyArray_DESCR(%(_d)s)->elsize;
-            const npy_intp Sg1 = %(_g)s->strides[1]/PyArray_DESCR(%(_g)s)->elsize;
+            const npy_intp Sg1 = PyArray_STRIDES(%(_g)s)[1]/PyArray_DESCR(%(_g)s)->elsize;
            const npy_intp K = PyArray_DIMS(%(_d)s)[1];
-            const npy_int32 * __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+            const npy_int32 * __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-            const npy_int32 * __restrict__ indices = (npy_int32 *)%(_indices)s->data;
+            const npy_int32 * __restrict__ indices = (npy_int32 *)PyArray_DATA(%(_indices)s);
            // loop over columns of sparse matrix
            for (npy_int32 i = 0; i < N; ++i)
@@ -3161,11 +3161,11 @@ class StructuredDotGradCSR(gof.Op):
                    npy_int32 j = indices[j_idx * Sindices];
                    // extract j-th row of dense matrix
-                    const dtype_%(_d)s* __restrict__ d_row = (dtype_%(_d)s*)(%(_d)s->data + %(_d)s->strides[0] * j);
+                    const dtype_%(_d)s* __restrict__ d_row = (dtype_%(_d)s*)(PyArray_BYTES(%(_d)s) + PyArray_STRIDES(%(_d)s)[0] * j);
                    if(j >= PyArray_DIMS(%(_d)s)[0]) {PyErr_SetString(PyExc_NotImplementedError, "G"); %(fail)s;}
                    // extract corresponding row in gradient
-                    const dtype_%(_g)s* __restrict__ g_row = (dtype_%(_g)s*)(%(_g)s->data + %(_g)s->strides[0] * i);
+                    const dtype_%(_g)s* __restrict__ g_row = (dtype_%(_g)s*)(PyArray_BYTES(%(_g)s) + PyArray_STRIDES(%(_g)s)[0] * i);
                    double ip = 0.0;
                    // make sure that row index is not bigger than actual number of rows
@@ -3181,7 +3181,7 @@ class StructuredDotGradCSR(gof.Op):
                    }
                    // write resulting gradient to sparse output
-                    ((dtype_%(_zout)s* __restrict__)(%(_zout)s->data + j_idx * %(_zout)s->strides[0]))[0] = ip;
+                    ((dtype_%(_zout)s* __restrict__)(PyArray_BYTES(%(_zout)s) + j_idx * PyArray_STRIDES(%(_zout)s)[0]))[0] = ip;
                }
            }
        }

--- a/theano/sparse/opt.py
+++ b/theano/sparse/opt.py
@@ -142,19 +142,19 @@ class StructuredDotCSC(gof.Op):
        if (PyArray_NDIM(%(a_nrows)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(nrows) != 0"); %(fail)s;}
        if (PyArray_NDIM(%(b)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2"); %(fail)s;}
-        if (PyArray_DESCR(%(a_val)s)->type_num != %(typenum_a_val)s) {
+        if (PyArray_TYPE(%(a_val)s) != %(typenum_a_val)s) {
        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for a_val"); %(fail)s;}
-        if (PyArray_DESCR(%(b)s)->type_num != %(typenum_b)s) {
+        if (PyArray_TYPE(%(b)s) != %(typenum_b)s) {
        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for b"); %(fail)s;}
-        if (PyArray_DESCR(%(a_ind)s)->type_num != NPY_INT32) {
+        if (PyArray_TYPE(%(a_ind)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(a_ptr)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(a_ptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(a_nrows)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(a_nrows)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "a_nrows dtype not INT32"); %(fail)s;}
        if (PyArray_DIMS(%(a_val)s)[0] != PyArray_DIMS(%(a_ind)s)[0])
@@ -164,13 +164,13 @@ class StructuredDotCSC(gof.Op):
        {PyErr_SetString(PyExc_NotImplementedError, "a's number of columns doesn't match b's rows"); %(fail)s;}
        if ((!%(z)s)
-            || (PyArray_DIMS(%(z)s)[0] != ((npy_int32 *)%(a_nrows)s->data)[0])
+            || (PyArray_DIMS(%(z)s)[0] != ((npy_int32 *)PyArray_DATA(%(a_nrows)s))[0])
            || (PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(b)s)[1])
            )
        {
            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0, 0};
-            dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
+            dims[0] = ((npy_int32 *)PyArray_DATA(%(a_nrows)s))[0];
            dims[1] = PyArray_DIMS(%(b)s)[1];
            %(z)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_z)s);
        }
@@ -182,19 +182,19 @@ class StructuredDotCSC(gof.Op):
            npy_intp K = PyArray_DIMS(%(b)s)[0];
            // strides tell you how many bytes to skip to go to next column/row entry
-            npy_intp Szm = %(z)s->strides[0] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Szm = PyArray_STRIDES(%(z)s)[0] / PyArray_DESCR(%(z)s)->elsize;
-            npy_intp Szn = %(z)s->strides[1] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Szn = PyArray_STRIDES(%(z)s)[1] / PyArray_DESCR(%(z)s)->elsize;
-            //npy_intp Sbm = %(b)s->strides[0] / PyArray_DESCR(%(b)s)->elsize;
+            //npy_intp Sbm = PyArray_STRIDES(%(b)s)[0] / PyArray_DESCR(%(b)s)->elsize;
-            npy_intp Sbn = %(b)s->strides[1] / PyArray_DESCR(%(b)s)->elsize;
+            npy_intp Sbn = PyArray_STRIDES(%(b)s)[1] / PyArray_DESCR(%(b)s)->elsize;
-            npy_intp Sval = %(a_val)s->strides[0] / PyArray_DESCR(%(a_val)s)->elsize;
+            npy_intp Sval = PyArray_STRIDES(%(a_val)s)[0] / PyArray_DESCR(%(a_val)s)->elsize;
-            npy_intp Sind = %(a_ind)s->strides[0] / PyArray_DESCR(%(a_ind)s)->elsize;
+            npy_intp Sind = PyArray_STRIDES(%(a_ind)s)[0] / PyArray_DESCR(%(a_ind)s)->elsize;
-            npy_intp Sptr = %(a_ptr)s->strides[0] / PyArray_DESCR(%(a_ptr)s)->elsize;
+            npy_intp Sptr = PyArray_STRIDES(%(a_ptr)s)[0] / PyArray_DESCR(%(a_ptr)s)->elsize;
            // pointers to access actual data in the arrays passed as params.
-            dtype_%(z)s*     __restrict__ Dz   = (dtype_%(z)s*)%(z)s->data;
+            dtype_%(z)s*     __restrict__ Dz   = (dtype_%(z)s*)PyArray_DATA(%(z)s);
-            const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
+            const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)PyArray_DATA(%(a_val)s);
-            const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
+            const npy_int32 * __restrict__ Dind = (npy_int32*)PyArray_DATA(%(a_ind)s);
-            const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
+            const npy_int32 * __restrict__ Dptr = (npy_int32*)PyArray_DATA(%(a_ptr)s);
            //npy_intp nnz = PyArray_DIMS(%(a_ind)s)[0];
@@ -218,7 +218,7 @@ class StructuredDotCSC(gof.Op):
            for (npy_int32 k = 0; k < K; ++k)
            {
                // get pointer to k-th row of dense matrix
-                const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(%(b)s->data + %(b)s->strides[0] * k);
+                const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(PyArray_BYTES(%(b)s) + PyArray_STRIDES(%(b)s)[0] * k);
                // loop over sparse column indices through index pointer array
                // (amounts to looping over rows M of sparse matrix)
@@ -229,7 +229,7 @@ class StructuredDotCSC(gof.Op):
                    const dtype_%(a_val)s Amk = Dval[m_idx * Sval]; // actual value at that location
                    // pointer to m-th row of the output matrix Z
-                    dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(%(z)s->data + %(z)s->strides[0] * m);
+                    dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(PyArray_BYTES(%(z)s) + PyArray_STRIDES(%(z)s)[0] * m);
                    //RESOLVE: a.shape[0] equals z.shape[0], why is this not an equality constraint?
                    if (m >= PyArray_DIMS(%(z)s)[0])
@@ -330,10 +330,10 @@ class StructuredDotCSR(gof.Op):
        if (PyArray_NDIM(%(a_ptr)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(a_ptr) != 1"); %(fail)s;}
        if (PyArray_NDIM(%(b)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(b) != 2"); %(fail)s;}
-        if (PyArray_DESCR(%(a_ind)s)->type_num != NPY_INT32) {
+        if (PyArray_TYPE(%(a_ind)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(a_ptr)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(a_ptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
        if (PyArray_DIMS(%(a_val)s)[0] != PyArray_DIMS(%(a_ind)s)[0])
@@ -358,19 +358,19 @@ class StructuredDotCSR(gof.Op):
            npy_intp K = PyArray_DIMS(%(b)s)[0];
            // strides tell you how many bytes to skip to go to next column/row entry
-            npy_intp Szm = %(z)s->strides[0] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Szm = PyArray_STRIDES(%(z)s)[0] / PyArray_DESCR(%(z)s)->elsize;
-            npy_intp Szn = %(z)s->strides[1] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Szn = PyArray_STRIDES(%(z)s)[1] / PyArray_DESCR(%(z)s)->elsize;
-            npy_intp Sbm = %(b)s->strides[0] / PyArray_DESCR(%(b)s)->elsize;
+            npy_intp Sbm = PyArray_STRIDES(%(b)s)[0] / PyArray_DESCR(%(b)s)->elsize;
-            npy_intp Sbn = %(b)s->strides[1] / PyArray_DESCR(%(b)s)->elsize;
+            npy_intp Sbn = PyArray_STRIDES(%(b)s)[1] / PyArray_DESCR(%(b)s)->elsize;
-            npy_intp Sval = %(a_val)s->strides[0] / PyArray_DESCR(%(a_val)s)->elsize;
+            npy_intp Sval = PyArray_STRIDES(%(a_val)s)[0] / PyArray_DESCR(%(a_val)s)->elsize;
-            npy_intp Sind = %(a_ind)s->strides[0] / PyArray_DESCR(%(a_ind)s)->elsize;
+            npy_intp Sind = PyArray_STRIDES(%(a_ind)s)[0] / PyArray_DESCR(%(a_ind)s)->elsize;
-            npy_intp Sptr = %(a_ptr)s->strides[0] / PyArray_DESCR(%(a_ptr)s)->elsize;
+            npy_intp Sptr = PyArray_STRIDES(%(a_ptr)s)[0] / PyArray_DESCR(%(a_ptr)s)->elsize;
            // pointers to access actual data in the arrays passed as params.
-            dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
+            dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)PyArray_DATA(%(z)s);
-            const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
+            const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)PyArray_DATA(%(a_val)s);
-            const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
+            const npy_int32 * __restrict__ Dind = (npy_int32*)PyArray_DATA(%(a_ind)s);
-            const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
+            const npy_int32 * __restrict__ Dptr = (npy_int32*)PyArray_DATA(%(a_ptr)s);
            //npy_intp nnz = PyArray_DIMS(%(a_ind)s)[0];
@@ -393,7 +393,7 @@ class StructuredDotCSR(gof.Op):
            for (npy_int64 m = 0; m < M; ++m)
            {
                // pointer to m-th row of the output matrix Z
-                dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(%(z)s->data + %(z)s->strides[0] * m);
+                dtype_%(z)s* __restrict__ zm = (dtype_%(z)s*)(PyArray_BYTES(%(z)s) + PyArray_STRIDES(%(z)s)[0] * m);
                // loop over sparse rows indices through index pointer array
                // (amounts to looping over cols k of sparse matrix)
@@ -403,7 +403,7 @@ class StructuredDotCSR(gof.Op):
                    const dtype_%(a_val)s Amk = Dval[k_idx * Sval]; // actual value at that location
                    // get pointer to k-th row of dense matrix
-                    const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(%(b)s->data + %(b)s->strides[0] * k);
+                    const dtype_%(b)s* __restrict__ bk = (dtype_%(b)s*)(PyArray_BYTES(%(b)s) + PyArray_STRIDES(%(b)s)[0] * k);
                    // loop over final dimension (cols of dense matrix) and perform dot product
                    for(npy_int32 n = 0; n < N; ++n)
@@ -566,25 +566,25 @@ class UsmmCscDense(gof.Op):
        if (PyArray_NDIM(%(x_nrows)s) != 0) {PyErr_SetString(PyExc_NotImplementedError, "rank(nrows) != 0"); %(fail)s;}
        if (PyArray_NDIM(%(y)s) != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
-        if (PyArray_DESCR(%(x_val)s)->type_num != %(typenum_x_val)s) {
+        if (PyArray_TYPE(%(x_val)s) != %(typenum_x_val)s) {
        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for x_val"); %(fail)s;}
-        if (PyArray_DESCR(%(y)s)->type_num != %(typenum_y)s) {
+        if (PyArray_TYPE(%(y)s) != %(typenum_y)s) {
        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for y"); %(fail)s;}
-        if (PyArray_DESCR(%(z)s)->type_num != %(typenum_z)s) {
+        if (PyArray_TYPE(%(z)s) != %(typenum_z)s) {
        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for z"); %(fail)s;}
-        if (PyArray_DESCR(%(alpha)s)->type_num != %(typenum_alpha)s) {
+        if (PyArray_TYPE(%(alpha)s) != %(typenum_alpha)s) {
        PyErr_SetString(PyExc_NotImplementedError, "Invalid type for alpha"); %(fail)s;}
-        if (PyArray_DESCR(%(x_ind)s)->type_num != NPY_INT32) {
+        if (PyArray_TYPE(%(x_ind)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "x_ind dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(x_ptr)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(x_ptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "x_ptr dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(x_nrows)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(x_nrows)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "x_nrows dtype not INT32"); %(fail)s;}
        if (PyArray_DIMS(%(x_val)s)[0] != PyArray_DIMS(%(x_ind)s)[0])
@@ -593,7 +593,7 @@ class UsmmCscDense(gof.Op):
        if (PyArray_DIMS(%(x_ptr)s)[0] != PyArray_DIMS(%(y)s)[0]+1)
        {PyErr_SetString(PyExc_NotImplementedError, "x's number of columns doesn't match y's rows"); %(fail)s;}
-        if (PyArray_DIMS(%(z)s)[0] != ((npy_int32 *)%(x_nrows)s->data)[0] || PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(y)s)[1])
+        if (PyArray_DIMS(%(z)s)[0] != ((npy_int32 *)PyArray_DATA(%(x_nrows)s))[0] || PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(y)s)[1])
        {PyErr_SetString(PyExc_NotImplementedError, "The dimension of the allocated output doesn't match the correct output size."); %(fail)s;}
        if (PyArray_SIZE(%(alpha)s) != 1)
@@ -621,13 +621,13 @@ class UsmmCscDense(gof.Op):
            Py_INCREF(%(zn)s);
        }
        else if (!%(zn)s
-            || (PyArray_DIMS(%(zn)s)[0] != ((npy_int32 *)%(x_nrows)s->data)[0])
+            || (PyArray_DIMS(%(zn)s)[0] != ((npy_int32 *)PyArray_DATA(%(x_nrows)s))[0])
            || (PyArray_DIMS(%(zn)s)[1] != PyArray_DIMS(%(y)s)[1])
            )
        {
            {Py_XDECREF(%(zn)s);}
            npy_intp dims[] = {0, 0};
-            dims[0] = ((npy_int32 *)%(x_nrows)s->data)[0];
+            dims[0] = ((npy_int32 *)PyArray_DATA(%(x_nrows)s))[0];
            dims[1] = PyArray_DIMS(%(y)s)[1];
            %(zn)s = (PyArrayObject*) PyArray_SimpleNew(2, dims, %(typenum_zn)s);
        }
@@ -639,17 +639,17 @@ class UsmmCscDense(gof.Op):
            npy_intp K = PyArray_DIMS(%(y)s)[0];
            // pointers to access actual data in the arrays passed as params.
-            const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
+            const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)PyArray_DATA(%(x_val)s);
-            const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
+            const npy_int32 * __restrict__ Dind = (npy_int32*)PyArray_DATA(%(x_ind)s);
-            const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
+            const npy_int32 * __restrict__ Dptr = (npy_int32*)PyArray_DATA(%(x_ptr)s);
-            const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)%(alpha)s->data)[0];
+            const dtype_%(alpha)s alpha = ((dtype_%(alpha)s*)PyArray_DATA(%(alpha)s))[0];
-            npy_intp Sz = %(z)s->strides[1] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Sz = PyArray_STRIDES(%(z)s)[1] / PyArray_DESCR(%(z)s)->elsize;
-            npy_intp Szn = %(zn)s->strides[1] / PyArray_DESCR(%(zn)s)->elsize;
+            npy_intp Szn = PyArray_STRIDES(%(zn)s)[1] / PyArray_DESCR(%(zn)s)->elsize;
-            npy_intp Sval = %(x_val)s->strides[0] / PyArray_DESCR(%(x_val)s)->elsize;
+            npy_intp Sval = PyArray_STRIDES(%(x_val)s)[0] / PyArray_DESCR(%(x_val)s)->elsize;
-            npy_intp Sind = %(x_ind)s->strides[0] / PyArray_DESCR(%(x_ind)s)->elsize;
+            npy_intp Sind = PyArray_STRIDES(%(x_ind)s)[0] / PyArray_DESCR(%(x_ind)s)->elsize;
-            npy_intp Sptr = %(x_ptr)s->strides[0] / PyArray_DESCR(%(x_ptr)s)->elsize;
+            npy_intp Sptr = PyArray_STRIDES(%(x_ptr)s)[0] / PyArray_DESCR(%(x_ptr)s)->elsize;
-            npy_intp Sy = %(y)s->strides[1] / PyArray_DESCR(%(y)s)->elsize;
+            npy_intp Sy = PyArray_STRIDES(%(y)s)[1] / PyArray_DESCR(%(y)s)->elsize;
            if (!(%(inplace)s))
@@ -669,14 +669,14 @@ class UsmmCscDense(gof.Op):
                    const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
-                    dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
+                    dtype_%(y)s* y_row = (dtype_%(y)s*)(PyArray_BYTES(%(y)s) + PyArray_STRIDES(%(y)s)[0] * k);
                    // axpy expects pointer to the beginning of memory arrays,
                    // so when the stride is negative, we need to get the
                    // last element
                    if (Sy < 0)
                        y_row += (K - 1) * Sy;
-                    dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
+                    dtype_%(zn)s* z_row = (dtype_%(zn)s*)(PyArray_BYTES(%(zn)s) + PyArray_STRIDES(%(zn)s)[0] * m);
                    if (Szn < 0)
                        z_row += (N - 1) * Szn;
@@ -775,16 +775,16 @@ class CSMGradC(gof.Op):
        if (PyArray_NDIM(%(b_ind)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_ind) != 1"); %(fail)s;}
        if (PyArray_NDIM(%(b_ptr)s) != 1) {PyErr_SetString(PyExc_NotImplementedError, "rank(b_ptr) != 1"); %(fail)s;}
-        if (PyArray_DESCR(%(a_ind)s)->type_num != NPY_INT32) {
+        if (PyArray_TYPE(%(a_ind)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "a_ind dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(a_ptr)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(a_ptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "a_ptr dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(b_ind)s)->type_num != NPY_INT32) {
+        if (PyArray_TYPE(%(b_ind)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "b_ind dtype not INT32"); %(fail)s;}
-        if (PyArray_DESCR(%(b_ptr)s)->type_num != NPY_INT32)
+        if (PyArray_TYPE(%(b_ptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "b_ptr dtype not INT32"); %(fail)s;}
        if (PyArray_DIMS(%(a_val)s)[0] != PyArray_DIMS(%(a_ind)s)[0])
@@ -807,28 +807,28 @@ class CSMGradC(gof.Op):
        {
            // sparse array has size MxK, dense KxN, output MxN
            npy_intp M = PyArray_DIMS(%(a_ptr)s)[0] - 1;
-            npy_intp a_dim_0 = ((npy_int32 *)%(a_dim)s->data)[0];
+            npy_intp a_dim_0 = ((npy_int32 *)PyArray_DATA(%(a_dim)s))[0];
-            npy_intp a_dim_1 = ((npy_int32 *)%(a_dim)s->data)[1];
+            npy_intp a_dim_1 = ((npy_int32 *)PyArray_DATA(%(a_dim)s))[1];
            npy_intp sp_dim = (M == a_dim_0)?a_dim_1:a_dim_0;
            // strides tell you how many bytes to skip to go to next column/row entry
-            npy_intp Sz = %(z)s->strides[0] / PyArray_DESCR(%(z)s)->elsize;
+            npy_intp Sz = PyArray_STRIDES(%(z)s)[0] / PyArray_DESCR(%(z)s)->elsize;
-            npy_intp Sa_val = %(a_val)s->strides[0] / PyArray_DESCR(%(a_val)s)->elsize;
+            npy_intp Sa_val = PyArray_STRIDES(%(a_val)s)[0] / PyArray_DESCR(%(a_val)s)->elsize;
-            npy_intp Sa_ind = %(a_ind)s->strides[0] / PyArray_DESCR(%(a_ind)s)->elsize;
+            npy_intp Sa_ind = PyArray_STRIDES(%(a_ind)s)[0] / PyArray_DESCR(%(a_ind)s)->elsize;
-            npy_intp Sa_ptr = %(a_ptr)s->strides[0] / PyArray_DESCR(%(a_ptr)s)->elsize;
+            npy_intp Sa_ptr = PyArray_STRIDES(%(a_ptr)s)[0] / PyArray_DESCR(%(a_ptr)s)->elsize;
-            npy_intp Sb_val = %(b_val)s->strides[0] / PyArray_DESCR(%(b_val)s)->elsize;
+            npy_intp Sb_val = PyArray_STRIDES(%(b_val)s)[0] / PyArray_DESCR(%(b_val)s)->elsize;
-            npy_intp Sb_ind = %(b_ind)s->strides[0] / PyArray_DESCR(%(b_ind)s)->elsize;
+            npy_intp Sb_ind = PyArray_STRIDES(%(b_ind)s)[0] / PyArray_DESCR(%(b_ind)s)->elsize;
-            npy_intp Sb_ptr = %(b_ptr)s->strides[0] / PyArray_DESCR(%(b_ptr)s)->elsize;
+            npy_intp Sb_ptr = PyArray_STRIDES(%(b_ptr)s)[0] / PyArray_DESCR(%(b_ptr)s)->elsize;
            // pointers to access actual data in the arrays passed as params.
-            dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
+            dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)PyArray_DATA(%(z)s);
-            const dtype_%(a_val)s* __restrict__ Da_val = (dtype_%(a_val)s*)%(a_val)s->data;
+            const dtype_%(a_val)s* __restrict__ Da_val = (dtype_%(a_val)s*)PyArray_DATA(%(a_val)s);
-            const npy_int32 * __restrict__ Da_ind = (npy_int32*)%(a_ind)s->data;
+            const npy_int32 * __restrict__ Da_ind = (npy_int32*)PyArray_DATA(%(a_ind)s);
-            const npy_int32 * __restrict__ Da_ptr = (npy_int32*)%(a_ptr)s->data;
+            const npy_int32 * __restrict__ Da_ptr = (npy_int32*)PyArray_DATA(%(a_ptr)s);
-            const dtype_%(b_val)s* __restrict__ Db_val = (dtype_%(b_val)s*)%(b_val)s->data;
+            const dtype_%(b_val)s* __restrict__ Db_val = (dtype_%(b_val)s*)PyArray_DATA(%(b_val)s);
-            const npy_int32 * __restrict__ Db_ind = (npy_int32*)%(b_ind)s->data;
+            const npy_int32 * __restrict__ Db_ind = (npy_int32*)PyArray_DATA(%(b_ind)s);
-            const npy_int32 * __restrict__ Db_ptr = (npy_int32*)%(b_ptr)s->data;
+            const npy_int32 * __restrict__ Db_ptr = (npy_int32*)PyArray_DATA(%(b_ptr)s);
            npy_intp nnz = PyArray_DIMS(%(a_ind)s)[0];
@@ -937,10 +937,10 @@ class MulSDCSC(gof.Op):
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;}
-        if( PyArray_DESCR(%(_indices)s)->type_num != NPY_INT32) {
+        if( PyArray_TYPE(%(_indices)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
-        if( PyArray_DESCR(%(_indptr)s)->type_num != NPY_INT32)
+        if( PyArray_TYPE(%(_indptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s ||
@@ -949,7 +949,7 @@ class MulSDCSC(gof.Op):
        {
            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
-                  PyArray_DIMS(%(_indices)s), PyArray_DESCR(%(_b)s)->type_num);
+                  PyArray_DIMS(%(_indices)s), PyArray_TYPE(%(_b)s));
            if (!%(_zout)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -963,13 +963,13 @@ class MulSDCSC(gof.Op):
            //TODO: error checking with this
            const npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1;
-            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
+            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)PyArray_DATA(%(_data)s);
-            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+            const npy_int32 * const __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
+            const npy_int32 * const __restrict__ indices = (npy_int32 *)PyArray_DATA(%(_indices)s);
-            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
+            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)PyArray_DATA(%(_zout)s);
-            const npy_intp Sb = %(_b)s->strides[0];
+            const npy_intp Sb = PyArray_STRIDES(%(_b)s)[0];
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
@@ -981,7 +981,7 @@ class MulSDCSC(gof.Op):
                    npy_int32 i = indices[i_idx];
                    // extract i-th row of dense matrix
-                    const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * i);
+                    const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(PyArray_BYTES(%(_b)s) + Sb * i);
                    // write resulting gradient to sparse output
                    zout[i_idx] = data[i_idx] * b_row[j];
@@ -1053,10 +1053,10 @@ class MulSDCSR(gof.Op):
            PyErr_SetString(PyExc_NotImplementedError, "rank(indptr) != 1");
            %(fail)s;}
-        if( PyArray_DESCR(%(_indices)s)->type_num != NPY_INT32) {
+        if( PyArray_TYPE(%(_indices)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
-        if( PyArray_DESCR(%(_indptr)s)->type_num != NPY_INT32)
+        if( PyArray_TYPE(%(_indptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s ||
@@ -1065,7 +1065,7 @@ class MulSDCSR(gof.Op):
        {
            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
-                    PyArray_DIMS(%(_indices)s), PyArray_DESCR(%(_b)s)->type_num);
+                    PyArray_DIMS(%(_indices)s), PyArray_TYPE(%(_b)s));
            if (!%(_zout)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -1079,19 +1079,19 @@ class MulSDCSR(gof.Op):
            //TODO: error checking with this
            const npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1;
-            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
+            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)PyArray_DATA(%(_data)s);
-            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+            const npy_int32 * const __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
+            const npy_int32 * const __restrict__ indices = (npy_int32 *)PyArray_DATA(%(_indices)s);
-            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
+            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)PyArray_DATA(%(_zout)s);
-            const npy_intp Sb = %(_b)s->strides[0];
+            const npy_intp Sb = PyArray_STRIDES(%(_b)s)[0];
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
            {
                // extract i-th row of dense matrix
-                const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(%(_b)s->data + Sb * j);
+                const dtype_%(_b)s* __restrict__ b_row = (dtype_%(_b)s*)(PyArray_BYTES(%(_b)s) + Sb * j);
                // for each non-null value in the sparse column
                for (npy_int32 i_idx = indptr[j]; i_idx < indptr[j+1]; ++i_idx)
@@ -1209,10 +1209,10 @@ class MulSVCSR(gof.Op):
            %(fail)s;
        }
-        if( PyArray_DESCR(%(_indices)s)->type_num != NPY_INT32) {
+        if( PyArray_TYPE(%(_indices)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
-        if( PyArray_DESCR(%(_indptr)s)->type_num != NPY_INT32)
+        if( PyArray_TYPE(%(_indptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s
@@ -1221,7 +1221,7 @@ class MulSVCSR(gof.Op):
        {
            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
-                    PyArray_DIMS(%(_indices)s), PyArray_DESCR(%(_b)s)->type_num);
+                    PyArray_DIMS(%(_indices)s), PyArray_TYPE(%(_b)s));
        }
        { //makes it compile even though labels jump over variable definitions.
@@ -1229,15 +1229,15 @@ class MulSVCSR(gof.Op):
            //TODO: error checking with this
            const npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1;
-            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
+            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)PyArray_DATA(%(_data)s);
-            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+            const npy_int32 * const __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
+            const npy_int32 * const __restrict__ indices = (npy_int32 *)PyArray_DATA(%(_indices)s);
-            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
+            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)PyArray_DATA(%(_b)s);
-            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
+            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)PyArray_DATA(%(_zout)s);
-            const npy_intp Sb = %(_b)s->strides[0] / PyArray_DESCR(%(_b)s)->elsize;
+            const npy_intp Sb = PyArray_STRIDES(%(_b)s)[0] / PyArray_DESCR(%(_b)s)->elsize;
            // loop over rows
            for (npy_int32 j = 0; j < N; ++j)
@@ -1359,10 +1359,10 @@ class StructuredAddSVCSR(gof.Op):
            %(fail)s;
        }
-        if( PyArray_DESCR(%(_indices)s)->type_num != NPY_INT32) {
+        if( PyArray_TYPE(%(_indices)s) != NPY_INT32) {
        PyErr_SetString(PyExc_NotImplementedError, "C"); %(fail)s;}
-        if( PyArray_DESCR(%(_indptr)s)->type_num != NPY_INT32)
+        if( PyArray_TYPE(%(_indptr)s) != NPY_INT32)
        {PyErr_SetString(PyExc_NotImplementedError, "D"); %(fail)s;}
        if (!%(_zout)s
@@ -1371,7 +1371,7 @@ class StructuredAddSVCSR(gof.Op):
        {
            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1,
-                    PyArray_DIMS(%(_indices)s), PyArray_DESCR(%(_b)s)->type_num);
+                    PyArray_DIMS(%(_indices)s), PyArray_TYPE(%(_b)s));
            if (!%(_zout)s)
            {
                PyErr_SetString(PyExc_MemoryError,
@@ -1385,15 +1385,15 @@ class StructuredAddSVCSR(gof.Op):
            //TODO: error checking with this
            const npy_intp N =  PyArray_DIMS(%(_indptr)s)[0]-1;
-            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)%(_data)s->data;
+            const dtype_%(_data)s * const __restrict__ data = (dtype_%(_data)s*)PyArray_DATA(%(_data)s);
-            const npy_int32 * const __restrict__ indptr = (npy_int32 *)%(_indptr)s->data;
+            const npy_int32 * const __restrict__ indptr = (npy_int32 *)PyArray_DATA(%(_indptr)s);
-            const npy_int32 * const __restrict__ indices = (npy_int32 *)%(_indices)s->data;
+            const npy_int32 * const __restrict__ indices = (npy_int32 *)PyArray_DATA(%(_indices)s);
-            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)%(_b)s->data;
+            const dtype_%(_b)s* __restrict__ Db = (dtype_%(_b)s*)PyArray_DATA(%(_b)s);
-            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)%(_zout)s->data;
+            dtype_%(_zout)s * const __restrict__ zout = (dtype_%(_zout)s*)PyArray_DATA(%(_zout)s);
-            const npy_intp Sb = %(_b)s->strides[0] / PyArray_DESCR(%(_b)s)->elsize;
+            const npy_intp Sb = PyArray_STRIDES(%(_b)s)[0] / PyArray_DESCR(%(_b)s)->elsize;
            // loop over columns
            for (npy_int32 j = 0; j < N; ++j)
@@ -1575,17 +1575,17 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
        if (PyArray_NDIM(%(y)s) != 2) {
 PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
-        if (PyArray_DESCR(%(x)s)->type_num != %(typenum_x)s) {
+        if (PyArray_TYPE(%(x)s) != %(typenum_x)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for x");
            %(fail)s;}
-        if (PyArray_DESCR(%(y)s)->type_num != %(typenum_y)s) {
+        if (PyArray_TYPE(%(y)s) != %(typenum_y)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for y");
            %(fail)s;}
-        if (PyArray_DESCR(%(p_data)s)->type_num != %(typenum_p)s) {
+        if (PyArray_TYPE(%(p_data)s) != %(typenum_p)s) {
            PyErr_SetString(PyExc_NotImplementedError,
                            "Invalid type for pattern");
            %(fail)s;}
@@ -1595,7 +1595,7 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
              "x's number of columns doesn't match y's rows! Note: sampling_dot is different from dot because y is assumed to be transposed.");
            %(fail)s;}
-        if (PyArray_DIMS(%(y)s)[0] != ((npy_int32 *)%(p_ncols)s->data)[0] ||
+        if (PyArray_DIMS(%(y)s)[0] != ((npy_int32 *)PyArray_DATA(%(p_ncols)s))[0] ||
            PyArray_DIMS(%(x)s)[0] != (PyArray_DIMS(%(p_ptr)s)[0] - 1))
        {PyErr_SetString(PyExc_NotImplementedError,
        "The dimension of the pattern and the output must match"); %(fail)s;}
@@ -1603,7 +1603,7 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
        // Allocate output
        if (!%(z_data)s
            || (PyArray_DIMS(%(z_data)s)[0] != PyArray_DIMS(%(p_data)s)[0])
-            || (PyArray_DESCR(%(z_data)s)->type_num != %(typenum_zd)s)
+            || (PyArray_TYPE(%(z_data)s) != %(typenum_zd)s)
            || !(PyArray_ISCONTIGUOUS(%(z_data)s)))
         {
            {Py_XDECREF(%(z_data)s);}
@@ -1614,7 +1614,7 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
        }
        if (!%(z_ind)s
            || (PyArray_DIMS(%(z_ind)s)[0] != PyArray_DIMS(%(p_ind)s)[0])
-            || (PyArray_DESCR(%(z_ind)s)->type_num != %(typenum_zi)s)
+            || (PyArray_TYPE(%(z_ind)s) != %(typenum_zi)s)
            || !(PyArray_ISCONTIGUOUS(%(z_ind)s)))
        {
            {Py_XDECREF(%(z_ind)s);}
@@ -1625,7 +1625,7 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
        }
        if (!%(z_ptr)s
            || (PyArray_DIMS(%(z_ptr)s)[0] != PyArray_DIMS(%(p_ptr)s)[0])
-            || (PyArray_DESCR(%(z_ptr)s)->type_num != %(typenum_zp)s)
+            || (PyArray_TYPE(%(z_ptr)s) != %(typenum_zp)s)
            || !(PyArray_ISCONTIGUOUS(%(z_ptr)s)))
        {
            {Py_XDECREF(%(z_ptr)s);}
@@ -1642,23 +1642,23 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
            npy_intp K = PyArray_DIMS(%(y)s)[1];
            // pointers to access actual data in the arrays passed as params.
-            const dtype_%(x)s* __restrict__ Dx = (dtype_%(x)s*)%(x)s->data;
+            const dtype_%(x)s* __restrict__ Dx = (dtype_%(x)s*)PyArray_DATA(%(x)s);
-            const dtype_%(y)s* __restrict__ Dy = (dtype_%(y)s*)%(y)s->data;
+            const dtype_%(y)s* __restrict__ Dy = (dtype_%(y)s*)PyArray_DATA(%(y)s);
-            const dtype_%(p_data)s* __restrict__ Dpd = (dtype_%(p_data)s*)%(p_data)s->data;
+            const dtype_%(p_data)s* __restrict__ Dpd = (dtype_%(p_data)s*)PyArray_DATA(%(p_data)s);
-            const dtype_%(p_ind)s* __restrict__ Dpi = (dtype_%(p_ind)s*)%(p_ind)s->data;
+            const dtype_%(p_ind)s* __restrict__ Dpi = (dtype_%(p_ind)s*)PyArray_DATA(%(p_ind)s);
-            const dtype_%(p_ptr)s* __restrict__ Dpp = (dtype_%(p_ptr)s*)%(p_ptr)s->data;
+            const dtype_%(p_ptr)s* __restrict__ Dpp = (dtype_%(p_ptr)s*)PyArray_DATA(%(p_ptr)s);
-            dtype_%(z_data)s* __restrict__ Dzd = (dtype_%(z_data)s*)%(z_data)s->data;
+            dtype_%(z_data)s* __restrict__ Dzd = (dtype_%(z_data)s*)PyArray_DATA(%(z_data)s);
-            dtype_%(z_ind)s* __restrict__ Dzi = (dtype_%(z_ind)s*)%(z_ind)s->data;
+            dtype_%(z_ind)s* __restrict__ Dzi = (dtype_%(z_ind)s*)PyArray_DATA(%(z_ind)s);
-            dtype_%(z_ptr)s* __restrict__ Dzp = (dtype_%(z_ptr)s*)%(z_ptr)s->data;
+            dtype_%(z_ptr)s* __restrict__ Dzp = (dtype_%(z_ptr)s*)PyArray_DATA(%(z_ptr)s);
-            const npy_intp Sdx = %(x)s->strides[1]/PyArray_DESCR(%(x)s)->elsize;
+            const npy_intp Sdx = PyArray_STRIDES(%(x)s)[1]/PyArray_DESCR(%(x)s)->elsize;
-            const npy_intp Sdy = %(y)s->strides[1]/PyArray_DESCR(%(y)s)->elsize;
+            const npy_intp Sdy = PyArray_STRIDES(%(y)s)[1]/PyArray_DESCR(%(y)s)->elsize;
-            const npy_intp Sdpd = %(p_data)s->strides[0] / PyArray_DESCR(%(p_data)s)->elsize;
+            const npy_intp Sdpd = PyArray_STRIDES(%(p_data)s)[0] / PyArray_DESCR(%(p_data)s)->elsize;
-            const npy_intp Sdpi = %(p_ind)s->strides[0] / PyArray_DESCR(%(p_ind)s)->elsize;
+            const npy_intp Sdpi = PyArray_STRIDES(%(p_ind)s)[0] / PyArray_DESCR(%(p_ind)s)->elsize;
-            const npy_intp Sdpp = %(p_ptr)s->strides[0] / PyArray_DESCR(%(p_ptr)s)->elsize;
+            const npy_intp Sdpp = PyArray_STRIDES(%(p_ptr)s)[0] / PyArray_DESCR(%(p_ptr)s)->elsize;
-            const npy_intp Sdzd = %(z_data)s->strides[0] / PyArray_DESCR(%(z_data)s)->elsize;
+            const npy_intp Sdzd = PyArray_STRIDES(%(z_data)s)[0] / PyArray_DESCR(%(z_data)s)->elsize;
-            const npy_intp Sdzi = %(z_ind)s->strides[0] / PyArray_DESCR(%(z_ind)s)->elsize;
+            const npy_intp Sdzi = PyArray_STRIDES(%(z_ind)s)[0] / PyArray_DESCR(%(z_ind)s)->elsize;
-            const npy_intp Sdzp = %(z_ptr)s->strides[0] / PyArray_DESCR(%(z_ptr)s)->elsize;
+            const npy_intp Sdzp = PyArray_STRIDES(%(z_ptr)s)[0] / PyArray_DESCR(%(z_ptr)s)->elsize;
            memcpy(Dzi, Dpi, PyArray_DIMS(%(p_ind)s)[0]*sizeof(dtype_%(p_ind)s));
            memcpy(Dzp, Dpp, PyArray_DIMS(%(p_ptr)s)[0]*sizeof(dtype_%(p_ptr)s));
@@ -1667,9 +1667,9 @@ PyErr_SetString(PyExc_NotImplementedError, "rank(y) != 2"); %(fail)s;}
                for (npy_int32 n_idx = Dpp[m * Sdpp]; n_idx < Dpp[(m+1)*Sdpp]; ++n_idx) {
                    const npy_int32 n = Dpi[n_idx * Sdpi]; // row index of non-null value for column K
-                    const dtype_%(x)s* x_row = (dtype_%(x)s*)(%(x)s->data + %(x)s->strides[0] * m);
+                    const dtype_%(x)s* x_row = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * m);
-                    const dtype_%(y)s* y_col = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * n);
+                    const dtype_%(y)s* y_col = (dtype_%(y)s*)(PyArray_BYTES(%(y)s) + PyArray_STRIDES(%(y)s)[0] * n);
                    Dzd[n_idx * Sdzd] = Dpd[n_idx * Sdpd] * %(cdot)s((int*)&K, (const %(conv_type)s*)x_row, (int*)&Sdx, (const %(conv_type)s*)y_col, (int*)&Sdy);
                }

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -3905,7 +3905,7 @@ class Reshape(Op):
            }
            Py_XDECREF(%(z)s);
            %(z)s = (PyArrayObject *) PyArray_Newshape(%(x)s, &newshape,
-                PyArray_CORDER);
+                NPY_CORDER);
            if (!%(z)s)
            {
                //The error message should have been set by PyArray_Newshape

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -336,7 +336,7 @@ class DimShuffle(Op):
                'PyArray_UpdateFlags(%(res)s, NPY_ARRAY_UPDATE_ALL)',
                #we are making a view in both inplace and non-inplace cases
 """
-#if NPY_VERSION <= 0x01000009
+#if NPY_API_VERSION < 0x00000007
 PyArray_BASE(%(res)s) = (PyObject*)%(basename)s;
 #else
 PyArray_SetBaseObject(%(res)s, (PyObject*)%(basename)s);

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -118,14 +118,14 @@ class SoftmaxWithBias(gof.Op):
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
-        if ((PyArray_DESCR(%(x)s)->type_num != NPY_DOUBLE) &&
+        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
-            (PyArray_DESCR(%(x)s)->type_num != NPY_FLOAT))
+            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "not a float");
            %(fail)s;
        }
-        if ((PyArray_DESCR(%(b)s)->type_num != NPY_DOUBLE) &&
+        if ((PyArray_TYPE(%(b)s) != NPY_DOUBLE) &&
-            (PyArray_DESCR(%(b)s)->type_num != NPY_FLOAT))
+            (PyArray_TYPE(%(b)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "b not float");
            %(fail)s;
@@ -264,15 +264,15 @@ class SoftmaxGrad(gof.Op):
        dy, sm = inp
        dx, = out
        return '''
-        if ((PyArray_DESCR(%(dy)s)->type_num != NPY_DOUBLE) &&
+        if ((PyArray_TYPE(%(dy)s) != NPY_DOUBLE) &&
-            (PyArray_DESCR(%(dy)s)->type_num != NPY_FLOAT))
+            (PyArray_TYPE(%(dy)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError,
                 "types should be float or float64");
            %(fail)s;
        }
-        if ((PyArray_DESCR(%(sm)s)->type_num != NPY_DOUBLE) &&
+        if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) &&
-            (PyArray_DESCR(%(sm)s)->type_num != NPY_FLOAT))
+            (PyArray_TYPE(%(sm)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError,
                 "types should be float or float64");
@@ -395,23 +395,23 @@ class Softmax(gof.Op):
        #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
        init_decl = """
-        npy_intp* Nx = %(x)s->dimensions;
+        npy_intp* Nx = PyArray_DIMS(%(x)s);
-        if (%(x)s->nd != 2)
+        if (PyArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
            %(fail)s;
        }
-        if ((%(x)s->descr->type_num != PyArray_DOUBLE) &&
+        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
-            (%(x)s->descr->type_num != PyArray_FLOAT))
+            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "not a float");
            %(fail)s;
        }
        if ((NULL == %(sm)s)
-            || (%(sm)s->dimensions[0] != %(x)s->dimensions[0])
+            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
-            || (%(sm)s->dimensions[1] != %(x)s->dimensions[1]))
+            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
        {
            if (NULL != %(sm)s) Py_XDECREF(%(sm)s);
            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s),
@@ -431,13 +431,13 @@ class Softmax(gof.Op):
            double sum = 0.0;
            bool  discount_max = false;
-            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(%(x)s->data + %(x)s->strides[0] * i);
+            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
-            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(%(sm)s->data + %(sm)s->strides[0] * i);
+            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
        """
        inside_row_loop = """
-            npy_intp Sx = %(x)s->strides[1]/sizeof(dtype_%(x)s);
+            npy_intp Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
-            npy_intp Ssm = %(sm)s->strides[1]/sizeof(dtype_%(sm)s);
+            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
            size_t row_max_j=0;
            dtype_%(sm)s row_max = x_i[0];
@@ -1018,15 +1018,15 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
        y_idx_type = node.inputs[2].type.dtype_specs()[1]
        return """
-        if ((PyArray_DESCR(%(dnll)s)->type_num != NPY_DOUBLE) &&
+        if ((PyArray_TYPE(%(dnll)s) != NPY_DOUBLE) &&
-            (PyArray_DESCR(%(dnll)s)->type_num != NPY_FLOAT))
+            (PyArray_TYPE(%(dnll)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError,
                 "dnll type should be float32 or float64");
            %(fail)s;
        }
-        if ((PyArray_DESCR(%(sm)s)->type_num != NPY_DOUBLE) &&
+        if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) &&
-            (PyArray_DESCR(%(sm)s)->type_num != NPY_FLOAT))
+            (PyArray_TYPE(%(sm)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError,
                 "sm type should be float32 or float64");

--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -9,14 +9,13 @@ _logger = logging.getLogger("theano.tensor.subtensor")
 import numpy
 import theano
-from theano.compat.six import StringIO
 from theano.gradient import DisconnectedType
 from theano import gof
 from theano.gof import Apply, Constant, hashtype, Op, Type, MethodNotDefined
 from theano.gof.python25 import maxsize
 from theano.printing import pprint
 from theano import scalar as scal
-from theano.tensor.basic import (addbroadcast, clip, sum, exp,
+from theano.tensor.basic import (addbroadcast, clip,
                                 ARange, TensorType)
 from theano.tensor.elemwise import DimShuffle
 from theano.tensor.type_other import NoneConst, SliceType, make_slice
@@ -533,57 +532,29 @@ class Subtensor(Op):
        return {
            "c_prefix": "PyArray",
-                "update_flags": ("PyArray_UpdateFlags(%(view_name)s,"
-                " NPY_ARRAY_C_CONTIGUOUS|"
-                "NPY_ARRAY_F_CONTIGUOUS);"),
-                "set_data": "PyArray_set_data",
-                "set_dim": "PyArray_set_dim",
-                "set_stride": "PyArray_set_stride",
            "strides_mul": 1,
-                "view_name": "xview"}
+            }
    @staticmethod
-    def helper_c_code(node, name, inputs, outputs, sub, idx_list,
+    def helper_c_code(node, name, inputs, outputs, sub, idx_list, view_ndim,
                      c_prefix=None,
-                      update_flags=None,
-                      set_data=None,
-                      set_dim=None,
-                      set_stride=None,
                      strides_mul=None,
-                      view_name=None
                  ):
        """
-        The parameters c_prefix, update_flags, set_data, set_dim,
+        The parameters c_prefix and strides_mul are there to allow reusing this
-        set_stride and strides_mul are there to allow reusing this
        function on PyArray and CudaNdarray object.
+        Emits C code setting xview_offset, xview_dims and xview_strides for x.
        """
        default_args = Subtensor.default_helper_c_code_args()
-        if update_flags is None:
-            update_flags = default_args['update_flags']
-        if set_data is None:
-            set_data = default_args['set_data']
-        if set_dim is None:
-            set_dim = default_args['set_dim']
-        if set_stride is None:
-            set_stride = default_args['set_stride']
        if strides_mul is None:
            strides_mul = default_args['strides_mul']
        if c_prefix is None:
            c_prefix = default_args['c_prefix']
-        if view_name is None:
-            view_name = default_args['view_name']
-        #update_flags may depend on view_name
-        update_flags = update_flags % locals()
        #
        # two arrays are created in C code:
        # is_slice: len == ndim, 0 means int, 1 means slice
@@ -657,12 +628,7 @@ class Subtensor(Op):
        x, = inputs[:1]
        z, = outputs
-        xview = view_name
        rval = """
-        #define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d
-        #define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d
-        #define PyArray_set_data(obj, ptr, base) PyArray_BYTES(obj)=ptr
        // The subtensor is created by iterating over the dimensions
        // and updating stride, shape, and data pointers
@@ -674,32 +640,10 @@ class Subtensor(Op):
        int inner_ii = 0; // the current dimension of zview
        int outer_ii = 0; // current dimension of z
-        char* ptr = (char*) %(c_prefix)s_BYTES(%(xview)s);
+        // Argument of the view
+        ssize_t xview_offset = 0;
-        if ((%(c_prefix)s_DIMS(%(xview)s) == %(c_prefix)s_DIMS(%(x)s))
+        ssize_t xview_dims[%(view_ndim)s];
-            && (%(c_prefix)s_DIMS(%(x)s) != NULL))
+        ssize_t xview_strides[%(view_ndim)s];
-        {
-            PyErr_Format(PyExc_ValueError, "x and %(xview)s"
-                         "(with %%d dims) have the same dimensions"
-                         " pointers: %%p and %%p",
-                         %(c_prefix)s_NDIM(%(x)s),
-                         %(c_prefix)s_DIMS(%(xview)s),
-                         %(c_prefix)s_DIMS(%(x)s));
-            Py_XDECREF(%(xview)s);
-            %(fail)s;
-        }
-        if (%(c_prefix)s_STRIDES(%(xview)s) == %(c_prefix)s_STRIDES(%(x)s)
-            && (%(c_prefix)s_DIMS(%(x)s) != NULL))
-        {
-            PyErr_Format(PyExc_ValueError, "x and %(xview)s"
-                         "(with %%d dims) have the same strides"
-                         " pointers: %%p and %%p",
-                         %(c_prefix)s_NDIM(%(x)s),
-                         %(c_prefix)s_STRIDES(%(xview)s),
-                         %(c_prefix)s_STRIDES(%(x)s));
-            Py_XDECREF(%(xview)s);
-            %(fail)s;
-        }
        for (; outer_ii < %(len_is_slice)s; ++outer_ii)
        {
@@ -719,10 +663,8 @@ class Subtensor(Op):
                // PySlice_GetIndicesEx in python source
                if (!step)
                {
-                    Py_DECREF(%(xview)s);
                    PyErr_Format(PyExc_ValueError,
                                 "slice step cannot be zero");
-                    Py_XDECREF(%(xview)s);
                    %(fail)s;
                }
@@ -771,11 +713,10 @@ class Subtensor(Op):
                assert (slicelength <= length);
-                ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
+                xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
                       %(strides_mul)s;
-                %(set_dim)s(%(xview)s, inner_ii, slicelength);
+                xview_dims[inner_ii] = slicelength;
-                %(set_stride)s(%(xview)s, inner_ii,
+                xview_strides[inner_ii] = %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step;
-                               %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step);
                inner_ii += 1;
                spec_pos += 3;
@@ -788,46 +729,41 @@ class Subtensor(Op):
                {
                    if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii])
                    {
-                        ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
+                        xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
                               %(strides_mul)s;
                    }
                    else
                    {
                        PyErr_Format(PyExc_IndexError,"index out of bounds");
-                        Py_XDECREF(%(xview)s);
                        %(fail)s;
                    }
                }
                else
                {
                    PyErr_Format(PyExc_IndexError,"index out of bounds");
-                    Py_XDECREF(%(xview)s);
                    %(fail)s;
                }
                spec_pos += 1;
            }
        }
-        %(set_data)s(%(xview)s, ptr, (PyObject*)NULL);
+        assert (inner_ii <= %(view_ndim)s);
-        assert (inner_ii <= %(c_prefix)s_NDIM(%(xview)s));
+        while (inner_ii < %(view_ndim)s)
-        while (inner_ii < %(c_prefix)s_NDIM(%(xview)s))
        {
            assert (outer_ii < %(c_prefix)s_NDIM(%(x)s));
-            %(set_dim)s(%(xview)s, inner_ii,
+            xview_dims[inner_ii] = %(c_prefix)s_DIMS(%(x)s)[outer_ii];
-                        %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
+            xview_strides[inner_ii] = %(c_prefix)s_STRIDES(%(x)s)[outer_ii];
-            %(set_stride)s(%(xview)s, inner_ii,
-                           %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
            inner_ii += 1;
            outer_ii += 1;
        }
-        %(update_flags)s
        """ % locals()
        # print rval
        return rval
    @staticmethod
    def helper_c_code_cache_version():
-        return (5,)
+        return (6,)
    def c_code(self, node, name, inputs, outputs, sub):  # DEBUG
        if not isinstance(node.inputs[0].type, theano.tensor.TensorType):
@@ -838,36 +774,45 @@ class Subtensor(Op):
        view_ndim = node.outputs[0].ndim
        fail = sub['fail']
+        decl = "PyArrayObject * xview = NULL;"
+        get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
+                                       self.idx_list, view_ndim)
        build_view = """
        //TODO: give this Op a second output so that this view can be cached
        //TODO: alternatively, fix the memory leak on failure
        Py_INCREF(PyArray_DESCR(%(x)s));
-        PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
+        xview = (PyArrayObject*)PyArray_NewFromDescr(
                &PyArray_Type,
                PyArray_DESCR(%(x)s),
                %(view_ndim)s,
-                PyArray_DIMS(%(x)s),
+                xview_dims,
-                PyArray_STRIDES(%(x)s),
+                xview_strides,
-                PyArray_DATA(%(x)s),
+                PyArray_BYTES(%(x)s) + xview_offset,
-                %(x)s->flags,
+                PyArray_FLAGS(%(x)s),
                NULL);
+        assert (PyArray_NDIM(xview) == %(view_ndim)s);
        if (!xview)
        {
            %(fail)s;
        }
        """ % locals()
-        get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
-                self.idx_list)
        finish_view = """
-        if (%(z)s) Py_DECREF(%(z)s);
+        //This is needed for NumPy 1.5, but not 1.7.2
+        PyArray_UpdateFlags(xview, NPY_ARRAY_C_CONTIGUOUS| NPY_ARRAY_F_CONTIGUOUS);
+        Py_XDECREF(%(z)s);
        Py_INCREF(py_%(x)s);
+#if NPY_API_VERSION < 0x00000007
        PyArray_BASE(xview) = py_%(x)s;
+#else
+        PyArray_SetBaseObject(xview, py_%(x)s);
+#endif
        assert(py_%(x)s == (PyObject*)%(x)s);
        %(z)s = xview;
        """ % locals()
-        return build_view + "{" + get_xview + "}" + finish_view
+        return decl + get_xview + build_view + finish_view
    def c_code_cache_version(self):
        hv = self.helper_c_code_cache_version()
@@ -1150,6 +1095,9 @@ class IncSubtensor(Op):
                         (x, y) + inputs,
                         [x.type()])
+    def decl_view(self):
+        return "PyArrayObject * zview = NULL;"
    def perform(self, node, inputs, out_):
        out, = out_
        x, y = inputs[:2]
@@ -1237,16 +1185,28 @@ class IncSubtensor(Op):
        }
        else
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            Py_XDECREF(%(z)s);
            %(z)s = %(copy_of_x)s;
        }
        """ % locals()
+        # get info needed to make zview: a view of %(z)s
+        helper_args = self.get_helper_c_code_args()
+        get_zview = Subtensor.helper_c_code(
+            node=node,
+            name=name,
+            inputs=outputs[:1] + inputs[2:],
+            outputs=outputs,
+            sub=sub,
+            idx_list=self.idx_list,
+            view_ndim=view_ndim,
+            ** helper_args
+        )
+        #Make a view on the output, as we will write into it.
        alloc_zview = self.make_view_array(z, view_ndim)
-        # On GPU, it takes two steps to make a view
-        link_zview = self.link_view_array(z, fail)
-        #Make a first view on the output, as we will write into it.
        build_view = """
        //TODO: give this Op a second output so that this view can be cached
        //TODO: alternatively, fix the memory leak on failure
@@ -1255,21 +1215,7 @@ class IncSubtensor(Op):
        {
            %(fail)s;
        }
-        %(link_zview)s;
        """ % locals()
-        # make zview actually a view of %(z)s
-        helper_args = self.get_helper_c_code_args()
-        helper_args['view_name'] = 'zview'
-        get_zview = self.define_set_data() + \
-                Subtensor.helper_c_code(
-                node=node,
-                name=name,
-                inputs=outputs[:1] + inputs[2:],
-                outputs=outputs,
-                sub=sub,
-                idx_list=self.idx_list,
-                ** helper_args
-                )
        copy_into = self.copy_into("zview", y)
@@ -1289,12 +1235,12 @@ class IncSubtensor(Op):
            %(add_to_zview)s
        }
        """ % locals()
+        return (self.decl_view() +
-        return (copy_input_if_necessary
+                copy_input_if_necessary +
-                + build_view
+                get_zview +
-                + "{" + get_zview + "}"
+                build_view +
-                + make_modification
+                make_modification +
-                + "Py_DECREF(zview);"
+                "Py_DECREF(zview);"
                )
    def do_type_checking(self, node):
@@ -1344,16 +1290,18 @@ class IncSubtensor(Op):
        """
        return """Py_INCREF(PyArray_DESCR(%(x)s));
-        PyArrayObject * zview =
+        zview = (PyArrayObject*)PyArray_NewFromDescr(
-                (PyArrayObject*)PyArray_NewFromDescr(
                &PyArray_Type,
                PyArray_DESCR(%(x)s),
                %(view_ndim)s,
-                PyArray_DIMS(%(x)s),
+                xview_dims, //PyArray_DIMS(%(x)s),
-                PyArray_STRIDES(%(x)s),
+                xview_strides, //PyArray_STRIDES(%(x)s),
-                PyArray_DATA(%(x)s),
+                PyArray_BYTES(%(x)s) + xview_offset, //PyArray_DATA(%(x)s),
-                %(x)s->flags,
+                PyArray_FLAGS(%(x)s),
-                NULL)""" % locals()
+                NULL);
+        //This is needed for NumPy 1.5, but not 1.7.2
+        PyArray_UpdateFlags(zview, NPY_ARRAY_C_CONTIGUOUS| NPY_ARRAY_F_CONTIGUOUS);
+        """ % locals()
    def get_helper_c_code_args(self):
        """ Return a dictionary of arguments to pass to helper_c_code."""
@@ -1369,24 +1317,6 @@ class IncSubtensor(Op):
        """
        return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
-    def define_set_data(self):
-        """ Returns C code used to define any macros used in the
-        set data argument to the helper C code. """
-        return ""
-    def link_view_array(self, x, fail):
-        """ Returns code to complete making zview a view of x"""
-        # On CPU there is nothing to do, make_view_array already did this
-        return ""
-    def set_view_base(self, x, fail):
-        """ Returns code to make zview be a correct view of x,
-        after helper_c_code is done messing with x"""
-        # On CPU there is nothing to do
-        return ""
    def add_to_zview(self, x, fail):
        """ Return C code to add x to zview. Should DECREF zview if the
        add fails."""
@@ -1567,7 +1497,7 @@ class AdvancedSubtensor1(Op):
        output_name = output_names[0]
        fail = sub['fail']
        return """
-            PyObject *indices;
+            PyArrayObject *indices;
            int i_type = PyArray_TYPE(%(i_name)s);
            if (i_type != NPY_INTP) {
                // Cast %(i_name)s to NPY_INTP (expected by PyArray_TakeFrom),
@@ -1602,13 +1532,13 @@ class AdvancedSubtensor1(Op):
                        %(fail)s;
                    }
                }
-                indices = PyArray_Cast(%(i_name)s, NPY_INTP);
+                indices = (PyArrayObject*) PyArray_Cast(%(i_name)s, NPY_INTP);
                if (indices == NULL) {
                    %(fail)s;
                }
            }
            else {
-                 indices = (PyObject *)%(i_name)s;
+                 indices = %(i_name)s;
                 Py_INCREF(indices);
            }
            if (%(output_name)s != NULL) {
@@ -1637,7 +1567,7 @@ class AdvancedSubtensor1(Op):
                }
            }
            %(output_name)s = (PyArrayObject*)PyArray_TakeFrom(
-                        %(a_name)s, indices, 0, %(output_name)s, NPY_RAISE);
+                        %(a_name)s, (PyObject*)indices, 0, %(output_name)s, NPY_RAISE);
            Py_DECREF(indices);
            if (%(output_name)s == NULL) %(fail)s;
        """ % locals()

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
-#from nose.plugins.skip import SkipTest
+from copy import copy
-#import traceback
+from unittest import TestCase
-import itertools
-import sys
-import theano.tensor as T
-from theano import tensor
-from theano.compat import PY3, exc_message
-from theano.gof.python25 import product as itertools_product
-from theano.gof.python25 import any
-from theano.printing import pp
 import numpy
-import theano
 from numpy import (arange, array, common_type, complex64, complex128, float32,
                  float64, newaxis, shape, transpose, zeros)
 from numpy.testing import assert_array_almost_equal
-#from numpy.testing import dec
-#from numpy.testing.noseclasses import KnownFailureTest
+import theano
+import theano.tensor as T
+from theano import tensor, Param, shared, config
+from theano.compat import exc_message
+from theano.gof.python25 import product as itertools_product
+from theano.gof.python25 import any
+from theano.printing import pp
 from theano.tensor.blas import (_dot22, _dot22scalar, res_is_a, _as_scalar,
                                _is_real_matrix, _gemm_canonicalize,
                                _factor_canonicalized, Gemm, Gemv,
                                gemm_inplace, gemm_no_inplace,
                                InconsistencyError, Ger, ger, ger_destructive)
-from unittest import TestCase
 from theano.tests import unittest_tools
-from copy import copy, deepcopy
-from theano import Param, shared, config
 from test_basic import (_approx_eq, as_tensor_variable, inplace_func,
        compile, inplace)
        #, constant, eval_outputs)
@@ -361,11 +353,8 @@ class t_gemm(TestCase):
                    z = tz.get_value(borrow=True, return_internal_type=True)
                    z[:, :, i] = z_i
-                    self.assertTrue(
+                    unittest_tools.assert_allclose(z_after[:, :, i],
-                            _approx_eq(z_after[:, :, i],
+                                                   tz.get_value(borrow=True)[:, :, i])
-                                       tz.get_value(borrow=True)[:, :, i]),
-                            (z_orig[:, :, i], z_after[:, :, i],
-                                z[:, :, i], z_after[:, :, i] - z[:, :, i]))
                tz_i = gemm_no_inplace(tz[:, :, i], ta, tx[
                    :, :, i], ty[:, :, i], tb)
@@ -374,11 +363,8 @@ class t_gemm(TestCase):
                        mode=compile.Mode(optimizer=None, linker=l))
                for j in xrange(3):
                    g_i()
-                    self.assertTrue(
+                    unittest_tools.assert_allclose(z_after[:, :, i],
-                            _approx_eq(z_after[:, :, i],
+                                                   tz.get_value(borrow=True)[:, :, i])
-                                       tz.get_value(borrow=True)[:, :, i]),
-                            (z_orig[:, :, i], z_after[:, :, i],
-                                z[:, :, i], z_after[:, :, i] - z[:, :, i]))
        t(C, A, B)
        t(C.transpose((1, 0, 2)), A, B)

--- a/theano/tensor/tests/test_inc_subtensor.py
+++ b/theano/tensor/tests/test_inc_subtensor.py
@@ -54,7 +54,7 @@ class Test_inc_subtensor(unittest.TestCase):
            else:
                expected_result[:, :val_sl2_end] += val_inc
-            self.assertTrue(numpy.array_equal(result, expected_result))
+            utt.assert_allclose(result, expected_result)
    def test_wrong_dims(self):
        a = tt.matrix()
@@ -122,7 +122,7 @@ class Test_inc_subtensor(unittest.TestCase):
            else:
                expected_result[:, sl3, :val_sl2_end] += val_inc
-            self.assertTrue(numpy.array_equal(result, expected_result))
+            utt.assert_allclose(result, expected_result)
    def test_grad_inc_set(self):
        def inc_slice(*s):

--- a/theano/tensor/type.py
+++ b/theano/tensor/type.py
@@ -446,8 +446,9 @@ class TensorType(Type):
            %(fail)s
        }
        // We expect %(type_num)s
-        type_num_%(name)s = ((PyArrayObject*)py_%(name)s)->descr->type_num;
+        type_num_%(name)s = PyArray_TYPE((PyArrayObject*) py_%(name)s);
-        if (!PyArray_ISALIGNED(py_%(name)s)) {
+        if (!PyArray_ISALIGNED((PyArrayObject*) py_%(name)s)) {
+            PyArrayObject * tmp = (PyArrayObject*) py_%(name)s;
            PyErr_Format(PyExc_NotImplementedError,
                         "expected an aligned array of type %%ld "
                         "(%(type_num)s), got non-aligned array of type %%ld"
@@ -456,19 +457,19 @@ class TensorType(Type):
                         " and 3 last strides %%ld %%ld, %%ld.",
                         (long int) %(type_num)s,
                         (long int) type_num_%(name)s,
-                         (long int) PyArray_NDIM(py_%(name)s),
+                         (long int) PyArray_NDIM(tmp),
-                         (long int) PyArray_NDIM(py_%(name)s) >= 3 ?
+                         (long int) PyArray_NDIM(tmp) >= 3 ?
-        PyArray_DIMS(py_%(name)s)[PyArray_NDIM(py_%(name)s)-3] : -1,
+        PyArray_DIMS(tmp)[PyArray_NDIM(tmp)-3] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 2 ?
+                         (long int) PyArray_NDIM(tmp) >= 2 ?
-        PyArray_DIMS(py_%(name)s)[PyArray_NDIM(py_%(name)s)-2] : -1,
+        PyArray_DIMS(tmp)[PyArray_NDIM(tmp)-2] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 1 ?
+                         (long int) PyArray_NDIM(tmp) >= 1 ?
-        PyArray_DIMS(py_%(name)s)[PyArray_NDIM(py_%(name)s)-1] : -1,
+        PyArray_DIMS(tmp)[PyArray_NDIM(tmp)-1] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 3 ?
+                         (long int) PyArray_NDIM(tmp) >= 3 ?
-        PyArray_STRIDES(py_%(name)s)[PyArray_NDIM(py_%(name)s)-3] : -1,
+        PyArray_STRIDES(tmp)[PyArray_NDIM(tmp)-3] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 2 ?
+                         (long int) PyArray_NDIM(tmp) >= 2 ?
-        PyArray_STRIDES(py_%(name)s)[PyArray_NDIM(py_%(name)s)-2] : -1,
+        PyArray_STRIDES(tmp)[PyArray_NDIM(tmp)-2] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 1 ?
+                         (long int) PyArray_NDIM(tmp) >= 1 ?
-        PyArray_STRIDES(py_%(name)s)[PyArray_NDIM(py_%(name)s)-1] : -1
+        PyArray_STRIDES(tmp)[PyArray_NDIM(tmp)-1] : -1
        );
            %(fail)s
        }
@@ -508,7 +509,7 @@ class TensorType(Type):
        {Py_XINCREF(py_%(name)s);}
-        if (!PyArray_ISALIGNED(py_%(name)s)) {
+        if (%(name)s && !PyArray_ISALIGNED((PyArrayObject*) py_%(name)s)) {
            PyErr_Format(PyExc_NotImplementedError,
                         "c_sync: expected an aligned array of type %%ld "
                         "(%(type_num)s), got non-aligned array of type %%ld"
@@ -517,19 +518,19 @@ class TensorType(Type):
                         " and 3 last strides %%ld %%ld, %%ld.",
                         (long int) %(type_num)s,
                         (long int) type_num_%(name)s,
-                         (long int) PyArray_NDIM(py_%(name)s),
+                         (long int) PyArray_NDIM(%(name)s),
-                         (long int) PyArray_NDIM(py_%(name)s) >= 3 ?
+                         (long int) PyArray_NDIM(%(name)s) >= 3 ?
-        PyArray_DIMS(py_%(name)s)[PyArray_NDIM(py_%(name)s)-3] : -1,
+        PyArray_DIMS(%(name)s)[PyArray_NDIM(%(name)s)-3] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 2 ?
+                         (long int) PyArray_NDIM(%(name)s) >= 2 ?
-        PyArray_DIMS(py_%(name)s)[PyArray_NDIM(py_%(name)s)-2] : -1,
+        PyArray_DIMS(%(name)s)[PyArray_NDIM(%(name)s)-2] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 1 ?
+                         (long int) PyArray_NDIM(%(name)s) >= 1 ?
-        PyArray_DIMS(py_%(name)s)[PyArray_NDIM(py_%(name)s)-1] : -1,
+        PyArray_DIMS(%(name)s)[PyArray_NDIM(%(name)s)-1] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 3 ?
+                         (long int) PyArray_NDIM(%(name)s) >= 3 ?
-        PyArray_STRIDES(py_%(name)s)[PyArray_NDIM(py_%(name)s)-3] : -1,
+        PyArray_STRIDES(%(name)s)[PyArray_NDIM(%(name)s)-3] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 2 ?
+                         (long int) PyArray_NDIM(%(name)s) >= 2 ?
-        PyArray_STRIDES(py_%(name)s)[PyArray_NDIM(py_%(name)s)-2] : -1,
+        PyArray_STRIDES(%(name)s)[PyArray_NDIM(%(name)s)-2] : -1,
-                         (long int) PyArray_NDIM(py_%(name)s) >= 1 ?
+                         (long int) PyArray_NDIM(%(name)s) >= 1 ?
-        PyArray_STRIDES(py_%(name)s)[PyArray_NDIM(py_%(name)s)-1] : -1
+        PyArray_STRIDES(%(name)s)[PyArray_NDIM(%(name)s)-1] : -1
        );
            %(fail)s
        }
@@ -555,7 +556,7 @@ class TensorType(Type):
    def c_code_cache_version(self):
        scalar_version = scal.Scalar(self.dtype).c_code_cache_version()
        if scalar_version:
-            return (10,) + scalar_version
+            return (11,) + scalar_version
        else:
            return ()

--- a/theano/tests/test_tutorial.py
+++ b/theano/tests/test_tutorial.py
@@ -919,9 +919,9 @@ class Fibby(theano.Op):
            if (!%(y)s)
                %(fail)s;
            {//New scope needed to make compilation work
-                dtype_%(y)s * y = (dtype_%(y)s*)%(y)s->data;
+                dtype_%(y)s * y = (dtype_%(y)s*)PyArray_DATA(%(y)s);
-                dtype_%(x)s * x = (dtype_%(x)s*)%(x)s->data;
+                dtype_%(x)s * x = (dtype_%(x)s*)PyArray_DATA(%(x)s);
-                for (int i = 2; i < %(x)s->dimensions[0]; ++i)
+                for (int i = 2; i < PyArray_DIMS(%(x)s)[0]; ++i)
                    y[i] = y[i-1]*y[i-2] + x[i];
            }
        """ % locals()