Merge pull request #939 from nouiz/c_code_gpusub

C code gpusub

Merge pull request #939 from nouiz/c_code_gpusub
c8934e50 · lamblin · 85f9247a · 02274d20 · c8934e50 · c8934e50
--- a/theano/gof/tests/test_opt.py
+++ b/theano/gof/tests/test_opt.py
@@ -403,7 +403,7 @@ class TestEquilibrium(object):
                 PatternSub((op4, 'x', 'y'), (op1, 'x', 'y')),
                 PatternSub((op3, (op2, 'x', 'y')), (op4, 'x', 'y'))
                 ],
-                max_use_ratio = 1. / len(g.nodes)) # each opt can only be applied once
+                max_use_ratio = 1. / len(g.apply_nodes)) # each opt can only be applied once
            opt.optimize(g)
        finally:
            _logger.setLevel(oldlevel)

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -544,12 +544,14 @@ def pydotprint(fct, outfile=None,

    if isinstance(fct, Function):
        mode = fct.maker.mode
-        fct_fgraph = fct.maker.fgraph
+        profile = getattr(fct, "profile", None)
        if (not isinstance(mode, ProfileMode)
            or not fct in mode.profile_stats):
            mode = None
+        fct_fgraph = fct.maker.fgraph
    elif isinstance(fct, gof.FunctionGraph):
        mode = None
+        profile = None
        fct_fgraph = fct
    else:
        raise ValueError(('pydotprint expects as input a theano.function or '
@@ -660,6 +662,14 @@ def pydotprint(fct, outfile=None,
            else:
                pf = time * 100 / mode.profile_stats[fct].fct_call_time
            prof_str = '   (%.3fs,%.3f%%,%.3f%%)' % (time, pt, pf)
+        elif profile:
+            time = profile.apply_time.get(node, 0)
+            #second, %fct time in profiler
+            if profile.fct_callcount == 0:
+                pf = 0
+            else:
+                pf = time * 100 / profile.fct_call_time
+            prof_str = '   (%.3fs,%.3f%%)' % (time, pf)
        applystr = str(node.op).replace(':', '_')
        applystr += prof_str
        if (applystr in all_strings) or with_ids:

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1911,7 +1911,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
        out[0] = x.reshape(tuple(shp))


-class GpuSubtensor(tensor.Subtensor, GpuOp):
+class GpuSubtensor(GpuOp, tensor.Subtensor):
    """
    Implement subtensor on the gpu.
    """
@@ -1920,19 +1920,16 @@ class GpuSubtensor(tensor.Subtensor, GpuOp):
        assert isinstance(x.type, CudaNdarrayType)
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = CudaNdarrayType(rval.outputs[0].type.broadcastable)
-        #We reverse the index here as a speed optimization
-        #this opt was saving 0.40e-05s of 3.49e05s
-        return Apply(self, [x] + list(reversed(rval.inputs[1:])), [otype()])
+        return Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        out, = out_
        x = inputs[0]
-        indices = inputs[1:]
+        indices = list(reversed(inputs[1:]))

        def convert(entry):
            if isinstance(entry, Type):
                rval = indices.pop()
-                #the if take about .25e-05s
                if sys.version_info < (2, 5):
                    # Before Python 2.5, PySlice_GetIndicesEx requires
                    # Python int to be passed.
@@ -1955,6 +1952,59 @@ class GpuSubtensor(tensor.Subtensor, GpuOp):
            cdata = cdata[0]
        out[0] = x.__getitem__(cdata)

+    def c_code(self, node, name, inputs, outputs, sub):
+        x = inputs[0]
+        z, = outputs
+        view_ndim = node.outputs[0].ndim
+        fail = sub['fail']
+
+        build_view = """
+        //TODO: give this Op a second output so that this view can be cached
+        //TODO: alternatively, fix the memory leak on failure
+        CudaNdarray* xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
+        if (!xview)
+        {
+            %(fail)s;
+        }
+        if (CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(%(x)s),
+                                       (PyObject*) NULL))
+        {
+            PyErr_Format(PyExc_RuntimeError,
+                         "GpuSubtensor is not able to set the"
+                         " devdata field of the view");
+            Py_XDECREF(xview);
+            %(fail)s;
+        }
+        cnda_mark_dev_structure_dirty(xview);
+        #define CudaNdarray_set_device_data2(obj, ptr, base) \
+                CudaNdarray_set_device_data(obj, (float *)ptr, base)
+""" % locals()
+        get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
+                                       self.idx_list,
+                                       c_prefix='CudaNdarray',
+                                       set_data='CudaNdarray_set_device_data2',
+                                       set_dim='CudaNdarray_set_dim',
+                                       set_stride='CudaNdarray_set_stride',
+                                       update_flags="", strides_mul=4)
+
+        finish_view = """
+        //Set the base only now
+
+        if(CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(xview),
+                                    %(x)s)){
+            PyErr_Format(PyExc_RuntimeError,
+                         "GpuSubtensor is not able to set"
+                         " the base of the view array");
+            Py_XDECREF(xview);
+            %(fail)s;
+        }
+
+        Py_XDECREF(%(z)s);
+        %(z)s = xview;
+        """ % locals()
+
+        return build_view + "{" + get_xview + "}" + finish_view
+

 class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    """

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -161,6 +161,14 @@ DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
 DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
 DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);

+// The following 4 macros are here to help write C code generators that work
+// on both PyArray and CudaNdarray.  They are used at least by Subtensor and
+// GpuSubtensor.
+#define CudaNdarray_DIMS CudaNdarray_HOST_DIMS
+#define CudaNdarray_NDIM(self) self->nd
+#define CudaNdarray_STRIDES CudaNdarray_HOST_STRIDES
+#define CudaNdarray_BYTES CudaNdarray_DEV_DATA
+
 /**
 * Return the number of elements in the ndarray (product of the dimensions)
 */

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -7,6 +7,8 @@ import subprocess
 import sys
 import warnings

+import numpy
+
 import theano
 from theano.gof.cc import hash_from_file
 from theano.gof.cmodule import (std_libs, std_lib_dirs,
@@ -121,6 +123,17 @@ class NVCC_compiler(object):
            os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh'))
        flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)

+        # numpy 1.7 deprecated the old NPY_* flag macros in favour of the
+        # NPY_ARRAY_* names, which did not exist before; define them here.
+        numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
+        if bool(numpy_ver < [1, 7]):
+            flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
+            flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
+            flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
+            flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
+            flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
+            flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
+
        # We compile cuda_ndarray.cu during import.
        # We should not add device properties at that time.
        # As the device is not selected yet!

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -3952,9 +3952,21 @@ class Subtensor(Op):
        return "%s{%s}" % (self.__class__.__name__, ", ".join(indices))

    @staticmethod
-    def helper_c_code(node, name, inputs, outputs, sub, idx_list):
-        if not isinstance(node.inputs[0].type, TensorType):
-            raise NotImplementedError()
+    def helper_c_code(node, name, inputs, outputs, sub, idx_list,
+                      c_prefix="PyArray",
+                      update_flags=("PyArray_UpdateFlags(xview,"
+                                    " NPY_ARRAY_C_CONTIGUOUS|"
+                                    "NPY_ARRAY_F_CONTIGUOUS);"),
+                      set_data='PyArray_set_data',
+                      set_dim='PyArray_set_dim',
+                      set_stride='PyArray_set_stride',
+                      strides_mul=1,
+                  ):
+        """The parameters c_prefix, update_flags, set_data, set_dim,
+        set_stride and strides_mul are there to allow reusing this
+        function on PyArray and CudaNdarray object.
+
+        """
        #
        # two arrays are created in C code:
        # is_slice: len == ndim, 0 means int, 1 means slice
@@ -4019,7 +4031,6 @@ class Subtensor(Op):
        assert len(is_slice) <= node.inputs[0].ndim, node.inputs[0].ndim

        len_is_slice = len(is_slice)
-        view_ndim = node.inputs[0].ndim - (numpy.asarray(is_slice) == 0).sum()

        len_subtensor_spec = spec_pos()

@@ -4030,6 +4041,10 @@ class Subtensor(Op):
        z, = outputs

        rval = """
+        #define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d
+        #define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d
+        #define PyArray_set_data(obj, ptr, base) PyArray_BYTES(obj)=ptr
+
        // The subtensor is created by iterating over the dimensions
        // and updating stride, shape, and data pointers

@@ -4040,39 +4055,30 @@ class Subtensor(Op):
        int inner_ii = 0; // the current dimension of zview
        int outer_ii = 0; // current dimension of z

-        //TODO: give this Op a second output so that this view can be cached
-        //TODO: alternatively, fix the memory leak on failure
-        Py_INCREF(PyArray_DESCR(%(x)s));
-        PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
-                &PyArray_Type,
-                PyArray_DESCR(%(x)s),
-                %(view_ndim)s,
-                PyArray_DIMS(%(x)s),
-                PyArray_STRIDES(%(x)s),
-                PyArray_DATA(%(x)s),
-                %(x)s->flags,
-                NULL);
-        if (!xview)
-        {
-            %(fail)s;
-        }
+        char* ptr = (char*) %(c_prefix)s_BYTES(xview);

-        if ((PyArray_DIMS(xview) == PyArray_DIMS(%(x)s))
-            && (PyArray_DIMS(%(x)s) != NULL))
+        if ((%(c_prefix)s_DIMS(xview) == %(c_prefix)s_DIMS(%(x)s))
+            && (%(c_prefix)s_DIMS(%(x)s) != NULL))
        {
            PyErr_Format(PyExc_ValueError, "x and xview"
                         "(with %%d dims) have the same dimensions"
                         " pointers: %%p and %%p",
-                         PyArray_NDIM(%(x)s), PyArray_DIMS(xview), PyArray_DIMS(%(x)s));
+                         %(c_prefix)s_NDIM(%(x)s),
+                         %(c_prefix)s_DIMS(xview),
+                         %(c_prefix)s_DIMS(%(x)s));
+            Py_XDECREF(xview);
            %(fail)s;
        }
-        if (PyArray_STRIDES(xview) == PyArray_STRIDES(%(x)s)
-            && (PyArray_DIMS(%(x)s) != NULL))
+        if (%(c_prefix)s_STRIDES(xview) == %(c_prefix)s_STRIDES(%(x)s)
+            && (%(c_prefix)s_DIMS(%(x)s) != NULL))
        {
            PyErr_Format(PyExc_ValueError, "x and xview"
                         "(with %%d dims) have the same strides"
                         " pointers: %%p and %%p",
-                         PyArray_NDIM(%(x)s), PyArray_STRIDES(xview), PyArray_STRIDES(%(x)s));
+                         %(c_prefix)s_NDIM(%(x)s),
+                         %(c_prefix)s_STRIDES(xview),
+                         %(c_prefix)s_STRIDES(%(x)s));
+            Py_XDECREF(xview);
            %(fail)s;
        }

@@ -4080,7 +4086,7 @@ class Subtensor(Op):
        {
            if (is_slice[outer_ii])
            {
-                npy_intp length = PyArray_DIMS(%(x)s)[outer_ii];
+                npy_intp length = %(c_prefix)s_DIMS(%(x)s)[outer_ii];
                npy_intp slicelength;
                npy_intp start = subtensor_spec[spec_pos+0];
                npy_intp stop  = subtensor_spec[spec_pos+1];
@@ -4097,6 +4103,6 @@ class Subtensor(Op):
-                    Py_DECREF(xview);
                    PyErr_Format(PyExc_ValueError,
                                 "slice step cannot be zero");
+                    Py_XDECREF(xview);
                    %(fail)s;
                }

@@ -4144,9 +4151,12 @@ class Subtensor(Op):
                }

                assert (slicelength <= length);
-                xview->data += PyArray_STRIDES(%(x)s)[outer_ii] * start;
-                PyArray_DIMS(xview)[inner_ii] = slicelength;
-                PyArray_STRIDES(xview)[inner_ii] = PyArray_STRIDES(%(x)s)[outer_ii] * step;
+
+                ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
+                       %(strides_mul)s;
+                %(set_dim)s(xview, inner_ii, slicelength);
+                %(set_stride)s(xview, inner_ii,
+                               %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step);

                inner_ii += 1;
                spec_pos += 3;
@@ -4154,53 +4164,81 @@ class Subtensor(Op):
            else // tuple coord `outer_ii` is an int
            {
                int idx = subtensor_spec[spec_pos];
-                if (idx < 0) idx += PyArray_DIMS(%(x)s)[outer_ii];
+                if (idx < 0) idx += %(c_prefix)s_DIMS(%(x)s)[outer_ii];
                if (idx >= 0)
                {
-                    if (idx < PyArray_DIMS(%(x)s)[outer_ii])
+                    if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii])
                    {
-                        xview->data += PyArray_STRIDES(%(x)s)[outer_ii] * idx;
+                        ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
+                               %(strides_mul)s;
                    }
                    else
                    {
                        PyErr_Format(PyExc_IndexError,"index out of bounds");
+                        Py_XDECREF(xview);
                        %(fail)s;
                    }
                }
                else
                {
                    PyErr_Format(PyExc_IndexError,"index out of bounds");
+                    Py_XDECREF(xview);
                    %(fail)s;
                }

                spec_pos += 1;
            }
        }
-        assert (inner_ii <= PyArray_NDIM(xview));
-        while (inner_ii < PyArray_NDIM(xview))
+        %(set_data)s(xview, ptr, (PyObject*)NULL);
+        assert (inner_ii <= %(c_prefix)s_NDIM(xview));
+        while (inner_ii < %(c_prefix)s_NDIM(xview))
        {
-            assert (outer_ii < PyArray_NDIM(%(x)s));
-            PyArray_DIMS(xview)[inner_ii] = PyArray_DIMS(%(x)s)[outer_ii];
-            PyArray_STRIDES(xview)[inner_ii] = PyArray_STRIDES(%(x)s)[outer_ii];
+            assert (outer_ii < %(c_prefix)s_NDIM(%(x)s));
+            %(set_dim)s(xview, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
+            %(set_stride)s(xview, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
            inner_ii += 1;
            outer_ii += 1;
        }
-        PyArray_UpdateFlags(xview, NPY_ARRAY_C_CONTIGUOUS|NPY_F_CONTIGUOUS);
+        %(update_flags)s
        """ % locals()
        # print rval
        return rval

    @staticmethod
    def helper_c_code_cache_version():
-        return (4,)
+        return (5,)

    def c_code(self, node, name, inputs, outputs, sub):  # DEBUG
-        part0 = self.helper_c_code(node, name, inputs, outputs, sub,
-                self.idx_list)
+        if not isinstance(node.inputs[0].type, TensorType):
+            raise NotImplementedError()

        x = inputs[0]
        z, = outputs
-        part1 = """
+        view_ndim = node.outputs[0].ndim
+        fail = sub['fail']
+
+        build_view = """
+        //TODO: give this Op a second output so that this view can be cached
+        //TODO: alternatively, fix the memory leak on failure
+        Py_INCREF(PyArray_DESCR(%(x)s));
+        PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
+                &PyArray_Type,
+                PyArray_DESCR(%(x)s),
+                %(view_ndim)s,
+                PyArray_DIMS(%(x)s),
+                PyArray_STRIDES(%(x)s),
+                PyArray_DATA(%(x)s),
+                %(x)s->flags,
+                NULL);
+        if (!xview)
+        {
+            %(fail)s;
+        }
+        """ % locals()
+        get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
+                self.idx_list)
+
+        finish_view = """
        if (%(z)s) Py_DECREF(%(z)s);
        Py_INCREF(py_%(x)s);
        PyArray_BASE(xview) = py_%(x)s;
@@ -4208,7 +4246,7 @@ class Subtensor(Op):
        %(z)s = xview;
        """ % locals()

-        return part0 + part1
+        return build_view + "{" + get_xview + "}" + finish_view

    def c_code_cache_version(self):
        hv = self.helper_c_code_cache_version()
@@ -4216,7 +4254,7 @@ class Subtensor(Op):
        # have a versioned version of this op's C code.
        if len(hv) == 0:
            return ()
-        return (1, hv)
+        return (2, hv)

    def R_op(self, inputs, eval_points):
        # Subtensor is not differentiable wrt to its indices, therefore we
@@ -4476,6 +4514,8 @@ class IncSubtensor(Op):
        out[0] = x

    def c_code(self, node, name, inputs, outputs, sub):  # DEBUG
+        if not isinstance(node.inputs[0].type, TensorType):
+            raise NotImplementedError()

        if self.inplace:  # convert bool to int
            inplace = 1
@@ -4489,7 +4529,9 @@ class IncSubtensor(Op):
        else:
            op_is_set = 0
        fail = sub['fail']
-
+        view_ndim = (node.inputs[0].ndim -
+                     numpy.sum([not isinstance(idx, slice)
+                                for idx in self.idx_list]))
        copy_input_if_necessary = """
        if (%(inplace)s)
        {
@@ -4508,6 +4550,25 @@ class IncSubtensor(Op):
        }
        """ % locals()

+        #Make a first view on the output, as we will write into it.
+        build_view = """
+        //TODO: give this Op a second output so that this view can be cached
+        //TODO: alternatively, fix the memory leak on failure
+        Py_INCREF(PyArray_DESCR(%(z)s));
+        PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
+                &PyArray_Type,
+                PyArray_DESCR(%(z)s),
+                %(view_ndim)s,
+                PyArray_DIMS(%(z)s),
+                PyArray_STRIDES(%(z)s),
+                PyArray_DATA(%(z)s),
+                %(z)s->flags,
+                NULL);
+        if (!xview)
+        {
+            %(fail)s;
+        }
+        """ % locals()
        # make xview actually a view of %(z)s
        get_xview = Subtensor.helper_c_code(node, name,
                outputs[:1] + inputs[2:],
@@ -4541,7 +4602,8 @@ class IncSubtensor(Op):
        """ % locals()

        return (copy_input_if_necessary
-                + get_xview
+                + build_view
+                + "{" + get_xview + "}"
                + make_modification
                + "Py_DECREF(xview);"
                )
@@ -5385,7 +5447,7 @@ class Reshape(Op):
                // -- will err if this will downcast. This could happen if the
                // -- user pass an int64 dtype, but npy_intp endup being int32.
                new_dims[ii] = ((dtype_%(shp)s*)(
-                        PyArray_DATA(%(shp)s) + ii * PyArray_STRIDES(%(shp)s)[0]))[0];
+                        PyArray_BYTES(%(shp)s) + ii * PyArray_STRIDES(%(shp)s)[0]))[0];
            }
            Py_XDECREF(%(z)s);
            %(z)s = (PyArrayObject *) PyArray_Newshape(%(x)s, &newshape,

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -159,9 +159,9 @@ class SoftmaxWithBias(gof.Op):
            double sum = 0.0;
            bool  discount_max = false;

-            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_DATA(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
-            const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_DATA(%(b)s));
-            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_DATA(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
+            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
+            const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
+            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
        """

        inside_row_loop = """
@@ -306,11 +306,11 @@ class SoftmaxGrad(gof.Op):

        for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
        {
-            const dtype_%(dy)s* __restrict__ dy_i = (dtype_%(dy)s*) (PyArray_DATA(%(dy)s) + PyArray_STRIDES(%(dy)s)[0] * i);
+            const dtype_%(dy)s* __restrict__ dy_i = (dtype_%(dy)s*) (PyArray_BYTES(%(dy)s) + PyArray_STRIDES(%(dy)s)[0] * i);
            npy_intp Sdy = PyArray_STRIDES(%(dy)s)[1]/sizeof(dtype_%(dy)s);
-            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*) (PyArray_DATA(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
+            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*) (PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
-            dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*) (PyArray_DATA(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
+            dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*) (PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
            npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);

            double sum_dy_times_sm = 0.;
@@ -825,9 +825,9 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
                """,
                begin_row_loop,
                """
-            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_DATA(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
-            dtype_%(nll) s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_DATA(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i);
-            %(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_DATA(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i);
+            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
+            dtype_%(nll) s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_BYTES(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i);
+            %(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_BYTES(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i);
                """,
                inside_row_loop,
                """
@@ -977,14 +977,14 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):

        for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
        {
-            const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_DATA(%(dnll)s) + PyArray_STRIDES(%(dnll)s)[0] * i))[0];
+            const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + PyArray_STRIDES(%(dnll)s)[0] * i))[0];

-            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_DATA(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
+            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];

-            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_DATA(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
+            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);

-            dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_DATA(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
+            dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
            npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);

            for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j)

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -2565,15 +2565,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.fail()

    def test1_ok_range_finite(self):
-        n = self.shared(numpy.ones(3, dtype=self.dtype) * 5)
+        n = self.shared(numpy.arange(3, dtype=self.dtype))
        t = n[0:2]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
        self.assertTrue(tval.shape == (2,))
-        self.assertTrue(tval[1] == 5.0)
+        self.assertTrue((tval == [0, 1]).all())

    def test2_ok_range_finite(self):
-        n = self.shared(numpy.ones((3, 4), dtype=self.dtype) * 5)
+        n = self.shared(numpy.arange(12, dtype=self.dtype).reshape((3, 4)))
        # Also check negative index
        for idx in [(slice(0, 2), 3), ((slice(0, 2), -1)), (slice(0, 2), -4)]:
            t = n[idx]  # l]#0:2,3]
@@ -2612,25 +2612,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):

    def test1_ok_range_infinite(self):
        #Subtensor.debug = True
-        n = self.shared(numpy.ones(3, dtype=self.dtype) * 5)
+        n = self.shared(numpy.arange(3, dtype=self.dtype))
        t = n[1:]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
        self.assertTrue(tval.shape == (2,))
-        self.assertTrue(tval[1] == 5.0)
+        self.assertTrue((tval == [1.0, 2.0]).all())

    def test1_ok_strided(self):
-        n = self.shared(numpy.ones(5, dtype=self.dtype) * 5)
+        n = self.shared(numpy.arange(5, dtype=self.dtype))
        t = n[1::2]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
        self.assertTrue(tval.shape == (2,))
-        self.assertTrue(tval[1] == 5.0)
+        self.assertTrue((tval == [1.0, 3.0]).all())

        t = n[0:-1:2]  # 0 to 1 from the end stepping by 2
        tval = self.eval_output_and_check(t)
        self.assertTrue(tval.shape == (2,))
-        self.assertTrue(tval[1] == 5.0)
+        self.assertTrue((tval == [0.0, 2.0]).all())

    def test2_err_bounds0(self):
        n = self.shared(numpy.ones((2, 3), dtype=self.dtype) * 5)
@@ -2671,8 +2671,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
            sys.stderr = old_stderr

    def test2_ok_elem(self):
-        n = self.shared(numpy.asarray(range(6), dtype=self.dtype).
-            reshape((2, 3)))
+        n = self.shared(numpy.arange(6, dtype=self.dtype).reshape((2, 3)))
        t = n[0, 2]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
@@ -2680,8 +2679,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(numpy.all(tval == 2))

    def test2_ok_row(self):
-        n = self.shared(numpy.asarray(range(6), dtype=self.dtype).
-            reshape((2, 3)))
+        n = self.shared(numpy.arange(6, dtype=self.dtype).reshape((2, 3)))
        t = n[1]
        self.assertFalse(any(n.type.broadcastable))
        self.assertTrue(isinstance(t.owner.op, Subtensor))
@@ -2690,25 +2688,24 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(numpy.all(tval == [3, 4, 5]))

    def test2_ok_col(self):
-        n = self.shared(numpy.ones((2, 3), dtype=self.dtype) * 5)
+        n = self.shared(numpy.arange(6, dtype=self.dtype).reshape((2, 3)))
        t = n[:, 0]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        self.assertFalse(any(n.type.broadcastable))
        tval = self.eval_output_and_check(t)
        self.assertTrue(tval.shape == (2,))
-        self.assertTrue(numpy.all(tval == 5.0))
+        self.assertTrue(numpy.all(tval == [0, 3]))

    def test2_ok_rows_finite(self):
-        n = self.shared(numpy.ones((4, 3), dtype=self.dtype) * 5)
+        n = self.shared(numpy.arange(12, dtype=self.dtype).reshape((4, 3)))
        t = n[1:3, 0]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
        self.assertTrue(tval.shape == (2,))
-        self.assertTrue(numpy.all(tval == 5.0))
+        self.assertTrue(numpy.all(tval == [3, 6]))

    def test2_ok_cols_infinite(self):
-        n = self.shared(numpy.asarray(range(12), dtype=self.dtype).
-            reshape((4, 3)))
+        n = self.shared(numpy.arange(12, dtype=self.dtype).reshape((4, 3)))
        t = n[1, 2:]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
@@ -2716,8 +2713,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(numpy.all(tval == 5))

    def test2_ok_strided(self):
-        n = self.shared(numpy.asarray(range(20), dtype=self.dtype).
-            reshape((4, 5)))
+        n = self.shared(numpy.arange(20, dtype=self.dtype).reshape((4, 5)))
        t = n[1:4:2, 1:5:2]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
@@ -2725,8 +2721,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(numpy.all(tval == [[6, 8], [16, 18]]))

    def test3_ok_mat(self):
-        n = self.shared(numpy.asarray(range(24), dtype=self.dtype).
-            reshape((2, 3, 4)))
+        n = self.shared(numpy.arange(24, dtype=self.dtype).reshape((2, 3, 4)))
        t = n[0, 0, 0]
        self.assertTrue(isinstance(t.owner.op, Subtensor))
        tval = self.eval_output_and_check(t)
@@ -2745,8 +2740,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        """
        newaxis = numpy.newaxis

-        n = self.shared(numpy.asarray(range(24), dtype=self.dtype).
-            reshape((2, 3, 4)))
+        n = self.shared(numpy.arange(24, dtype=self.dtype).reshape((2, 3, 4)))
        assert n.ndim == 3

        n4 = n[newaxis, :, :, :]