Merge pull request #1001 from abergeron/compyte

Support for a new type based on compyte in theano

Merge pull request #1001 from abergeron/compyte
44f9d0f7 · Frédéric Bastien · 93a7a5e3 · d935ba06 · 44f9d0f7 · 44f9d0f7
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -91,6 +91,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):

        theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()

+# Load the gpuarray backend when it is the selected device or when
+# gpuarray.init_device asks for it.
+if (config.device.startswith(('cuda', 'opencl')) or
+        config.gpuarray.init_device != ''):
+    import theano.sandbox.gpuarray
+
 # Use config.numpy to call numpy.seterr
 import numpy


--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -2,9 +2,8 @@ import os
 import logging
 import subprocess

-from theano.configparser import (
-        AddConfigVar, BoolParam, ConfigParam, EnumStr, IntParam,
-        TheanoConfigParser)
+from theano.configparser import (AddConfigVar, BoolParam, ConfigParam, EnumStr,
+                                 IntParam, StrParam, TheanoConfigParser)
 from theano.misc.cpucount import cpuCount
 from theano.misc.windows import call_subprocess_Popen

@@ -44,20 +43,42 @@ AddConfigVar('int_division',
 # gpu means let the driver select the gpu. Needed in case of gpu in
 # exclusive mode.
 # gpuX mean use the gpu number X.
+class DeviceParam(ConfigParam):
+    """Configuration parameter holding the name of a compute device.
+
+    A value is accepted when it starts with one of "cpu", "gpu",
+    "opencl" or "cuda"; any other value raises ValueError.  The
+    `allow_override` keyword (default True) is forwarded to ConfigParam.
+    """
+
+    def __init__(self, default, *options, **kwargs):
+        self.default = default
+
+        def check_prefix(val):
+            # Guard clause: bail out on names without a known prefix.
+            if not val.startswith(('cpu', 'gpu', 'opencl', 'cuda')):
+                raise ValueError(('Invalid value ("%s") for configuration '
+                                  'variable "%s". Valid options start with '
+                                  'one of "cpu", "gpu", "opencl", "cuda"'
+                                  % (val, self.fullname)))
+            return val
+
+        allow_override = kwargs.get("allow_override", True)
+        super(DeviceParam, self).__init__(default, check_prefix,
+                                          allow_override)
+
+    def __str__(self):
+        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
+
 AddConfigVar('device',
        ("Default device for computations. If gpu*, change the default to try "
         "to move computation to it and to put shared variable of float32 "
         "on it. Do not use upper case letters, only lower case even if "
         "NVIDIA use capital letters."),
-        EnumStr('cpu', 'gpu',
-            'gpu0', 'gpu1', 'gpu2', 'gpu3',
-            'gpu4', 'gpu5', 'gpu6', 'gpu7',
-            'gpu8', 'gpu9', 'gpu10', 'gpu11',
-            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-                allow_override=False),
+        DeviceParam('cpu', allow_override=False),
        in_c_key=False,
        )

+# String option, empty by default, naming a device to initialize for
+# gpuarray (see the help text below).  Not part of the C cache key.
+AddConfigVar('gpuarray.init_device',
+             """
+             Device to initialize for gpuarray use without moving
+             computations automatically.
+             """,
+             StrParam(''),
+             in_c_key=False)
+
 AddConfigVar('init_gpu_device',
        ("Initialize the gpu device to use, works only if device=cpu. "
         "Unlike 'device', setting this option will NOT move computations, "

--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
+import logging
+
+import theano
+from theano.configparser import config
+from theano.compile import optdb
+
+_logger_name = 'theano.sandbox.gpuarray'
+_logger = logging.getLogger(_logger_name)
+_logger.setLevel(logging.WARNING)
+
+error = _logger.error
+info = _logger.info
+
+pygpu_activated = False
+try:
+    import pygpu
+    import pygpu.gpuarray
+except ImportError:
+    pygpu = None
+
+# This is for documentation not to depend on the availability of pygpu
+from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
+                  GpuArraySharedVariable, gpuarray_shared_constructor)
+import opt
+
+
+def init_dev(dev):
+    """Initialize pygpu on `dev` and install it as the default context.
+
+    The module-level flag `pygpu_activated` is set to True once the
+    context has been installed.
+    """
+    global pygpu_activated
+    pygpu.set_default_context(pygpu.init(dev))
+    pygpu_activated = True
+
+# Import-time initialization of the backend, driven by the configuration.
+if pygpu:
+    try:
+        if (config.device.startswith('cuda') or
+            config.device.startswith('opencl')):
+            # gpuarray is the selected device: initialize it, register the
+            # shared-variable constructor and add the 'fast_run' and
+            # 'inplace' tags to the 'gpuarray_opt' optimization.
+            init_dev(config.device)
+            import theano.compile
+            theano.compile.shared_constructor(gpuarray_shared_constructor)
+            optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
+        elif config.gpuarray.init_device != '':
+            # Only initialize the device; nothing else is registered.
+            init_dev(config.gpuarray.init_device)
+    except Exception:
+        # Best effort: a failure is logged and the import carries on.
+        error("Could not initialize pygpu, support disabled", exc_info=True)
+else:
+    # pygpu could not be imported: only complain if the configuration
+    # actually asked for it.
+    if (config.gpuarray.init_device != '' or
+        config.device.startswith('opencl') or
+        config.device.startswith('cuda')):
+        error("pygpu was configured but could not be imported", exc_info=True)
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
+import os
+
+import numpy
+
+import theano
+from theano import Op, Type, Apply, Variable, Constant
+from theano import tensor, scalar, config
+from theano.scalar import Scalar
+
+from theano.gof.python25 import all, any
+
+try:
+    import pygpu
+    from pygpu import gpuarray, elemwise
+except ImportError:
+    pass
+
+from type import GpuArrayType
+
+def as_gpuarray_variable(x):
+    """Return `x` as a variable on the gpuarray device.
+
+    Objects exposing `_as_GpuArrayVariable` convert themselves; anything
+    else is made into a tensor variable and passed to `gpu_from_host`.
+    """
+    if not hasattr(x, '_as_GpuArrayVariable'):
+        # TODO we need to have the cuda -> gpu path taken care of.
+        return gpu_from_host(tensor.as_tensor_variable(x))
+    return x._as_GpuArrayVariable()
+
+
+def as_gpuarray(x):
+    """Return `gpuarray.array(x, copy=False)`."""
+    return gpuarray.array(x, copy=False)
+
+
+class HostFromGpu(Op):
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def __str__(self):
+        return 'HostFromGpu(gpuarray)'
+
+    def make_node(self, x):
+        if not isinstance(x.type, GpuArrayType):
+            raise TypeError(x)
+        return Apply(self, [x],
+                     [tensor.TensorType(dtype=x.dtype,
+                                        broadcastable=x.broadcastable,)()])
+
+    def perform(self, node, inp, out):
+        x, = inp
+        z, = out
+        z[0] = numpy.asarray(x)
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        return """
+        GpuArray %(name)s_ga_s;
+        GpuArray *%(name)s_ga = NULL;
+        int %(name)serr;
+        PyArray_Descr *%(name)s_dtype;
+        if (!GpuArray_ISONESEGMENT(&%(inp)s->ga)) {
+            if (GpuArray_copy(&%(name)s_ga_s, &%(inp)s->ga, GA_C_ORDER) != GA_NO_ERROR) {
+                PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
+                %(fail)s;
+            }
+            %(name)s_ga = &%(name)s_ga_s;
+        } else {
+            %(name)s_ga = &%(inp)s->ga;
+        }
+        %(name)s_dtype = typecode_to_dtype(%(inp)s->ga.typecode);
+        Py_XDECREF(%(out)s);
+        // PyArray_Empty below steals a reference to the dtype we pass it
+        // so we need an extra one to spare.
+        Py_INCREF(%(name)s_dtype);
+        %(out)s = (PyArrayObject *)PyArray_Empty(%(inp)s->ga.nd,
+                                (npy_intp *)%(inp)s->ga.dimensions,
+                                %(name)s_dtype,
+                                (%(inp)s->ga.flags & GA_F_CONTIGUOUS) &&
+                                !(%(inp)s->ga.flags & GA_C_CONTIGUOUS));
+        if (%(out)s == NULL) {
+            if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
+            %(fail)s
+        }
+        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
+                                    PyArray_NBYTES(%(out)s),
+                                    %(name)s_ga);
+        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
+        if (%(name)serr != GA_NO_ERROR) {
+            PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
+            %(fail)s
+        }
+        """ % {'name': name, 'fail': sub['fail'], 'inp': inputs[0],
+                'out': outputs[0]}
+
+    def c_code_cache_version(self):
+        return (1,)
+
+    def grad(self, inputs, grads):
+        gz, = grads
+        return [gpu_from_host(gz)]
+                                                   
+    def R_op(self, inputs, eval_points):
+        # Exactly one evaluation point is expected.
+        ev, = eval_points
+        # NOTE(review): `ev` is tested against the TensorType class itself;
+        # if evaluation points are Variables rather than Types this branch
+        # never fires -- confirm whether `ev.type` was intended.
+        if isinstance(ev, tensor.TensorType):
+            return [gpu_from_host(ev)]
+        else:
+            return [ev]
+
+    def infer_shape(self, node, xshp):
+        return xshp
+
+
+host_from_gpu = HostFromGpu()
+
+
+class GpuFromHost(Op):
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def __str__(self):
+        return 'GpuFromHost(gpuarray)'
+
+    def make_node(self, x):
+        if not isinstance(x.type, tensor.TensorType):
+            raise TypeError(x)
+        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
+                                              dtype=x.dtype)()])
+
+    def perform(self, node, inp, out):
+        """Copy the single host input to the device.
+
+        The array built by `gpuarray.array` is stored in the first slot
+        of the single output storage cell in `out`.
+        """
+        x, = inp
+        z, = out
+        z[0] = gpuarray.array(x)
+
+    def grad(self, inputs, grads):
+        gz, = grads
+        return [host_from_gpu(as_gpuarray_variable(gz))]
+
+    def R_op(self, inputs, eval_points):
+        """Return the R-operator result as a one-element list.
+
+        A GpuArrayType evaluation point is passed through
+        `host_from_gpu`; anything else is returned unchanged.
+        """
+        ev, = eval_points
+        # NOTE(review): the check is on `ev` itself, not `ev.type` --
+        # confirm which one callers are expected to hand in.
+        if isinstance(ev, GpuArrayType):
+            return [host_from_gpu(ev)]
+        else:
+            # Always return a list, like the other branch.
+            return [ev]
+
+    def infer_shape(self, node, xshp):
+        return xshp
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        return """
+        PyArrayObject *%(name)s_tmp;
+        int %(name)serr;
+        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
+        if (%(name)s_tmp == NULL) {
+            // PyArray_GETCONTIGUOUS sets an error message if it fails
+            %(fail)s
+        }
+        Py_XDECREF(%(out)s);
+        %(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
+        if (%(out)s == NULL) {
+            Py_DECREF(%(name)s_tmp);
+            // new_GpuArray calls __new__ which will set an error message
+            // if it returns NULL.
+            %(fail)s
+        }
+        %(name)serr = GpuArray_empty(&%(out)s->ga,
+                                     GpuArray_default_context()->ops,
+                                     GpuArray_default_context()->ctx,
+                                     get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
+                                     PyArray_NDIM(%(inp)s),
+                                     (size_t *)PyArray_DIMS(%(inp)s),
+                                     GA_C_ORDER);
+        if (%(name)serr != GA_NO_ERROR) {
+            Py_DECREF(%(name)s_tmp);
+            Py_DECREF(%(out)s);
+            %(out)s = NULL;
+            PyErr_SetString(PyExc_MemoryError, "Can't allocate device memory for result.");
+            %(fail)s
+        }
+        %(name)serr = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
+                                     PyArray_NBYTES(%(name)s_tmp));
+        Py_DECREF(%(name)s_tmp);
+        if (%(name)serr != GA_NO_ERROR) {
+            Py_DECREF(%(out)s);
+            PyErr_SetString(PyExc_RuntimeError, "Could not copy array data to device");
+            %(fail)s
+        }
+        """ % {'name': name, 'inp': inputs[0],
+               'out': outputs[0], 'fail': sub['fail']}
+
+    def c_code_cache_version(self):
+        return (1,)
+
+gpu_from_host = GpuFromHost()
+
+
+class GpuFromCuda(Op):
+    view_map = {0: [0]}
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def __str__(self):
+        return 'GpuFromCuda'
+
+    def make_node(self, x):
+        from theano.sandbox.cuda import CudaNdarrayType
+        if not isinstance(x.type, CudaNdarrayType):
+            raise TypeError(x)
+        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
+                                              dtype=x.dtype)()])
+
+    def perform(self, node, inp, out):
+        x, = inp
+        z, = out
+        z[0] = gpuarray.array(numpy.asarray(x))
+
+    def grad(self, inputs, grads):
+        gz, = grads
+        return [cuda_from_gpu(gz)]
+
+    def R_op(self, inputs, eval_points):
+        """Return the R-operator result as a one-element list.
+
+        A GpuArrayType evaluation point is passed through
+        `cuda_from_gpu`; anything else is returned unchanged.
+        """
+        ev, = eval_points
+        # NOTE(review): the check is on `ev` itself, not `ev.type` --
+        # confirm which one callers are expected to hand in.
+        if isinstance(ev, GpuArrayType):
+            return [cuda_from_gpu(ev)]
+        else:
+            # Always return a list, like the other branch.
+            return [ev]
+
+    def infer_shape(self, node, xshp):
+        return xshp
+
+    def c_headers(self):
+        return ['<cuda_ndarray.cuh>', '<compyte/extension.h>',
+                '<compyte/types.h>', '<cuda.h>']
+
+    def c_header_dirs(self):
+        import cuda_ndarray
+        ret = [os.path.dirname(cuda_ndarray.__file__)]
+        cuda_root = config.cuda.root
+        if cuda_root:
+            ret.append(os.path.join(cuda_root, 'include'))
+        return ret
+
+    def c_lib_dirs(self):
+        import cuda_ndarray
+        ret = [os.path.dirname(cuda_ndarray.__file__)]
+        cuda_root = config.cuda.root
+        if cuda_root:
+            ret.append(os.path.join(cuda_root, 'lib'))
+        return ret
+
+    def c_libraries(self):
+        return ['cudart', 'cublas', 'cuda']
+
+    def c_support_code(self):
+        return """
+        CUcontext (*cuda_get_ctx)(void *ctx);
+        gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
+        """
+
+    def c_init_code(self):
+        return ['cuda_get_ctx = (CUcontext (*)(void *))compyte_get_extension("cuda_get_ctx");',
+                'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))compyte_get_extension("cuda_make_buf");']
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        return """
+        int %(name)serr;
+        gpudata *%(name)sdata;
+        CUcontext %(name)scur;
+        size_t *%(name)sdims;
+        ssize_t *%(name)sstr;
+
+        cuCtxGetCurrent(&%(name)scur);
+        if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
+            PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
+            %(fail)s
+        }
+        %(name)sdims = (size_t *)calloc(%(in)s->nd, sizeof(size_t));
+        if (%(name)sdims == NULL) {
+            PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
+            %(fail)s
+        }
+        %(name)sstr = (ssize_t *)calloc(%(in)s->nd, sizeof(ssize_t));
+        if (%(name)sstr == NULL) {
+            free(%(name)sdims);
+            PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
+            %(fail)s
+        }
+
+        for (unsigned int i = 0; i < %(in)s->nd; i++) {
+            %(name)sdims[i] = (size_t)CudaNdarray_HOST_DIMS(%(in)s)[i];
+            %(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
+        }
+
+        Py_XDECREF(%(out)s);
+        %(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
+        if (%(out)s == NULL) {
+            free(%(name)sdims);
+            free(%(name)sstr);
+            %(fail)s
+        }
+
+        %(name)sdata = cuda_make_buf(GpuArray_default_context()->ctx,
+                                     (CUdeviceptr)%(in)s->devdata,
+                                     ((size_t)%(in)s->data_allocated)*4);
+        if (%(name)sdata == NULL) {
+            Py_DECREF(%(out)s);
+            free(%(name)sdims);
+            free(%(name)sstr);
+            PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
+            %(fail)s
+        }
+        %(name)serr = GpuArray_fromdata(&%(out)s->ga,
+                                        GpuArray_default_context()->ops,
+                                        %(name)sdata, 0, GA_FLOAT, %(in)s->nd,
+                                        %(name)sdims, %(name)sstr, 1);
+        free(%(name)sdims);
+        free(%(name)sstr);
+        if (%(name)serr != GA_NO_ERROR) {
+            Py_DECREF(%(out)s);
+            PyErr_SetString(PyExc_MemoryError, "Could not allocate GpuArray structure.");
+            %(fail)s
+        }
+        Py_INCREF(%(in)s);
+        %(out)s->base = (PyObject *)%(in)s;
+        """ % {'name':name, 'in': inputs[0], 'out': outputs[0],
+               'fail': sub['fail']}
+
+    def c_code_cache_version(self):
+        return (1,)
+
+gpu_from_cuda = GpuFromCuda()
+
+
+class CudaFromGpu(Op):
+    view_map = {0: [0]}
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def __str__(self):
+        return 'CudaFromGpu'
+
+    def make_node(self, x):
+        from theano.sandbox.cuda import CudaNdarrayType
+        if not isinstance(x.type, GpuArrayType):
+            raise TypeError(x)
+        if x.type.dtype != 'float32':
+            raise TypeError(x)
+        return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable)()])
+
+    def perform(self, node, inp, out):
+        from theano.sandbox.cuda import filter as cuda_filter
+        x, = inp
+        z, = out
+        z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
+                           tuple([0] * x.ndim), 0, z[0])
+
+    def grad(self, inputs, grads):
+        gz, = grads
+        return [gpu_from_cuda(gz)]
+
+    def R_op(self, inputs, eval_points):
+        """Return the R-operator result as a one-element list.
+
+        A CudaNdarrayType evaluation point is passed through
+        `gpu_from_cuda`; anything else is returned unchanged.
+        """
+        # Same spelling as the import in make_node; imported lazily so the
+        # module loads without the cuda backend.
+        from theano.sandbox.cuda import CudaNdarrayType
+        ev, = eval_points
+        if isinstance(ev, CudaNdarrayType):
+            return [gpu_from_cuda(ev)]
+        else:
+            return [ev]
+
+    def infer_shape(self, node, shp):
+        return shp
+
+    def c_headers(self):
+        return ['<cuda_ndarray.cuh>', '<compyte/extension.h>', '<cuda.h>']
+
+    def c_header_dirs(self):
+        import cuda_ndarray
+        ret = [os.path.dirname(cuda_ndarray.__file__)]
+        cuda_root = config.cuda.root
+        if cuda_root:
+            ret.append(os.path.join(cuda_root, 'include'))
+        return ret
+
+    def c_lib_dirs(self):
+        import cuda_ndarray
+        ret = [os.path.dirname(cuda_ndarray.__file__)]
+        cuda_root = config.cuda.root
+        if cuda_root:
+            ret.append(os.path.join(cuda_root, 'lib'))
+        return ret
+
+    def c_libraries(self):
+        return ['cudart', 'cublas', 'cuda']
+
+    def c_support_code(self):
+        return """
+        CUcontext (*cuda_get_ctx)(void *ctx);
+        CUdeviceptr (*cuda_get_ptr)(gpudata *g);
+        """
+
+    def c_init_code(self):
+        return ['cuda_get_ctx = (CUcontext (*)(void *ctx))compyte_get_extension("cuda_get_ctx");',
+                'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        return """
+        int %(name)serr = 0, %(name)si;
+        CUcontext %(name)scur;
+
+        cuCtxGetCurrent(&%(name)scur);
+        if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
+            PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
+            %(fail)s
+        }
+
+        Py_XDECREF(%(out)s);
+        %(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
+        if (!%(out)s) {
+            %(fail)s
+        }
+        for (%(name)si = 0; %(name)si < %(inp)s->ga.nd; %(name)si++) {
+            CudaNdarray_set_dim(%(out)s, %(name)si, %(inp)s->ga.dimensions[%(name)si]);
+            CudaNdarray_set_stride(%(out)s, %(name)si, %(inp)s->ga.strides[%(name)si]/4);
+        }
+        %(name)serr = CudaNdarray_set_device_data(%(out)s,
+          (float *)(((char *)cuda_get_ptr(%(inp)s->ga.data))+%(inp)s->ga.offset),
+                                          (PyObject *)%(inp)s);
+        if (%(name)serr) {
+           %(fail)s
+        }
+        """ % {'name': name, 'inp': inputs[0], 'out': outputs[0],
+               'fail': sub['fail']}
+
+    def c_code_cache_version(self):
+        return (1,)
+
+
+cuda_from_gpu = CudaFromGpu()
+
+
+class GpuAlloc(Op):
+    """Op that allocates a gpuarray of a given shape filled with a value.
+
+    All instances compare and hash equal since the Op has no parameters.
+    """
+
+    def __str__(self):
+        return 'GpuAlloc'
+
+    def __hash__(self):
+        return hash(type(self))
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def make_node(self, value, *shape):
+        """Build the Apply node for `value` broadcast to `shape`.
+
+        Raises TypeError when `value` has more dimensions than there are
+        shape arguments, or when a shape argument is not an integer.
+        """
+        v = as_gpuarray_variable(value)
+        sh = [tensor.as_tensor_variable(s) for s in shape]
+        bcast = []
+        if v.ndim > len(shape):
+            # Report the rank of the converted variable: the raw `value`
+            # is not guaranteed to have an `ndim` attribute.
+            raise TypeError(
+                'GpuAlloc value has more dimensions than arguments',
+                v.ndim, len(shape))
+        for s in sh:
+            # Compare 3-character prefixes so that both 'int*' and
+            # 'uint*' dtypes are accepted.
+            if s.type.dtype[:3] not in ('int', 'uin'):
+                raise TypeError('Shape arguments must be integers', s)
+            try:
+                const_shp = tensor.get_scalar_constant_value(s)
+            except tensor.NotScalarConstantError:
+                const_shp = None
+            # A dimension is broadcastable only if it is a constant 1.
+            bcast.append(numpy.all(1 == const_shp))
+        otype = GpuArrayType(dtype=v.dtype, broadcastable=bcast)
+        return Apply(self, [v] + sh, [otype()])
+
+    def perform(self, node, inputs, outs):
+        """Fill the output with the value, reusing the buffer if possible.
+
+        A new array is allocated only when there is no previous output or
+        its shape differs from the requested one.
+        """
+        out, = outs
+        v = inputs[0]
+        sh = tuple(map(int, inputs[1:]))
+        if out[0] is None or out[0].shape != sh:
+            out[0] = gpuarray.empty(sh, dtype=v.dtype)
+        out[0][...] = v
+
+    def infer_shape(self, node, input_shapes):
+        # The output shape is given directly by the shape inputs.
+        return [node.inputs[1:]]
+
+    def grad(self, input, grads):
+        # No gradient is provided: one None per input.
+        return [None for _ in input]
+
+    def do_constant_folding(self, node):
+        """Return False when the output has no clients or is a graph output.
+
+        Otherwise return True.
+        """
+        if not getattr(node.outputs[0], 'clients', []):
+            return False
+        for client in node.outputs[0].clients:
+            if client[0] == 'output':
+                return False
+        return True
+
+gpu_alloc = GpuAlloc()
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
+import numpy
+from theano import Op, Apply, scalar
+
+try:
+    from pygpu.tools import ScalarArg, ArrayArg
+    from pygpu.elemwise import ElemwiseKernel
+except ImportError:
+    pass
+
+from basic_ops import as_gpuarray_variable
+from type import GpuArrayType
+
+from theano.gof.utils import MethodNotDefined
+
+def _is_scalar(v):
+    """Tell whether `v` should be handed to kernels as a scalar argument.
+
+    Scalar detection is not implemented: every input is reported as
+    non-scalar, so the function always returns False.
+    """
+    return False
+
+def make_argument(v, name):
+    """Build the pygpu kernel argument descriptor called `name` for `v`.
+
+    A ScalarArg is returned when `_is_scalar(v)` is true, an ArrayArg
+    otherwise; both receive the numpy dtype of `v.type.dtype`.
+    """
+    arg_class = ScalarArg if _is_scalar(v) else ArrayArg
+    return arg_class(numpy.dtype(v.type.dtype), name)
+
+def ensure_out(o, ref):
+    """Return `o`, or `ref._empty_like_me()` when `o` is None."""
+    return ref._empty_like_me() if o is None else o
+
+class GpuElemwise(Op):
+    nin = property(lambda self: self.scalar_op.nin)
+    nout = property(lambda self: self.scalar_op.nout)
+
+    def __init__(self, scalar_op):
+        self.scalar_op = scalar_op
+        self.destroy_map = {}
+
+    def __getstate__(self):
+        """Return the picklable state: a shallow copy of the instance dict.
+
+        The '__epydoc_asRoutine' and '_hashval' entries are dropped when
+        present; neither is required to exist.
+        """
+        # dict() makes the same shallow copy as copy.copy() without
+        # needing the `copy` module, which this file never imports.
+        d = dict(self.__dict__)
+        d.pop('__epydoc_asRoutine', None)
+        d.pop('_hashval', None)
+        return d
+
+    def __setstate__(self, d):
+        """Restore the instance dict from the pickled state `d`."""
+        self.__dict__.update(d)
+        # This class defines no `_rehash` (the hash is computed on demand
+        # in __hash__), so only call the hook when some other class in
+        # the hierarchy provides it.
+        rehash = getattr(self, '_rehash', None)
+        if rehash is not None:
+            rehash()
+
+    def __eq__(self, other):
+        return (type(self) == type(other) and
+                self.scalar_op == other.scalar_op)
+
+    def __hash__(self):
+        return hash(type(self)) ^ hash(self.scalar_op)
+
+    def __str__(self):
+        return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,)
+
+    def make_node(self, *inputs):
+        """Build the Apply node and the elementwise kernel for `inputs`.
+
+        Inputs are converted with `as_gpuarray_variable`.  An output
+        dimension is broadcastable only when it is broadcastable in every
+        input.  The generated kernel code and the ElemwiseKernel are stored
+        on the node as `tag.kcode` and `tag.kernel`.
+
+        Raises TypeError on a wrong argument count or mismatched ranks,
+        and SupportCodeError when the scalar op needs support code other
+        than the THEANO_MACRO_MOD define.
+        """
+        _inputs = [as_gpuarray_variable(i) for i in inputs]
+        if self.nin > 0 and len(_inputs) != self.nin:
+            raise TypeError("Wrong argument count", (self.nin, len(_inputs)))
+        # Work on the converted variables from here on: the raw inputs
+        # are not guaranteed to carry `.type` or `.dtype`.
+        for i in _inputs[1:]:
+            if i.type.ndim != _inputs[0].type.ndim:
+                raise TypeError('mismatched rank amongst inputs')
+
+        broadcastable = []
+        for d in xrange(_inputs[0].type.ndim):
+            bcast_d = True
+            for i in _inputs:
+                if not i.type.broadcastable[d]:
+                    bcast_d = False
+                    break
+            broadcastable.append(bcast_d)
+        assert len(broadcastable) == _inputs[0].type.ndim
+
+        assert self.nout > 0
+        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
+                enumerate(_inputs)]
+        scal_ins = [scalar.Scalar(i.dtype) for i in _inputs]
+
+        res = Apply(self, _inputs,
+                    [GpuArrayType(o.dtype, broadcastable)()
+                     for o in self.scalar_op.output_types(scal_ins)])
+
+        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
+                enumerate(res.outputs)]
+        scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]
+
+        # Scalar-level node used only to ask the scalar op for its C code.
+        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
+                          [o() for o in scal_out])
+
+        kcode = self.scalar_op.c_code(fake_node, 'kcode',
+                                      [i.expr() for i in inps],
+                                      [o.expr() for o in outs],
+                                      sub=dict(fail='return;'))
+        res.tag.kcode = kcode
+
+        # Per-apply support code is not supported at all.
+        try:
+            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
+            if code:
+                raise SupportCodeError()
+        except MethodNotDefined:
+            pass
+
+        support_code = ""
+        try:
+            support_code += self.scalar_op.c_support_code()
+        except MethodNotDefined:
+            pass
+
+        if support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)":
+            # Avoid the C++ complex struct
+            raise SupportCodeError()
+
+        k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code)
+        res.tag.kernel = k
+
+        return res
+
+    def perform(self, node, inps, out):
+        """Run the kernel stored on `node.tag` and store the results.
+
+        Missing outputs are allocated with `ensure_out`, using the first
+        input as the reference.
+        """
+        kernel = node.tag.kernel
+        results = [ensure_out(cell[0], inps[0]) for cell in out]
+
+        # the dict call is there to avoid syntax error in python <= 2.5
+        kernel(*(inps+results), **dict(broadcast=True))
+
+        for cell, result in zip(out, results):
+            cell[0] = result
+
+class SupportCodeError(Exception):
+    """Raised when a scalar op requires support code we cannot handle.
+
+    Certain things (such as the C++ complex struct) are not supported.
+    """
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
+import theano, numpy
+from theano import tensor
+from theano.compile import optdb
+from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
+                        Optimizer, toolbox, DestroyHandler,
+                        InconsistencyError, EquilibriumOptimizer)
+
+from theano.gof.python25 import all, any
+from theano.sandbox.gpuarray.type import GpuArrayType
+
+from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
+from elemwise import GpuElemwise, _is_scalar
+
+gpu_optimizer = EquilibriumDB()
+gpu_cut_copies = EquilibriumDB()
+
+gpu_seqopt = SequenceDB()
+
+gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
+                    'fast_run', 'inplace', 'gpuarray')
+gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
+                    'fast_run', 'gpuarray')
+
+# do not add 'fast_run' to these two as this would always enable gpuarray mode
+optdb.register('gpuarray_opt', gpu_seqopt,
+               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
+               'gpuarray')
+
+def register_opt(*tags, **kwargs):
+    """Return a decorator registering a local optimizer in `gpu_optimizer`.
+
+    The optimizer is registered with the 'fast_run' and 'gpuarray' tags
+    plus any extra `tags`.  The optional `name` keyword gives the
+    registration name; it defaults to the optimizer's `__name__`.
+    """
+    def f(local_opt):
+        # pop() with a default: other keywords without 'name' must not
+        # raise KeyError.
+        name = kwargs.pop('name', None) or local_opt.__name__
+        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
+        return local_opt
+    return f
+
+register_opt()(theano.tensor.opt.local_track_shape_i)
+
+class InputToGpuOptimizer(Optimizer):
+    """Transfer the input to the gpu to start the rolling wave.
+
+    Each eligible graph input is replaced by
+    `host_from_gpu(gpu_from_host(input))`.
+    """
+
+    def add_requirements(self, fgraph):
+        # replace_validate() used in apply() comes from ReplaceValidate.
+        fgraph.attach_feature(toolbox.ReplaceValidate())
+        fgraph.attach_feature(DestroyHandler())
+
+    def apply(self, fgraph):
+        for input in fgraph.inputs:
+            # Already on the device: nothing to do.
+            if isinstance(input.type, GpuArrayType):
+                continue
+
+            # Skip inputs whose only client is a graph output or an
+            # existing transfer to the device.
+            if (len(input.clients) == 1 and
+                (input.clients[0][0] == 'output' or
+                 input.clients[0][0].op == gpu_from_host)):
+                continue
+
+            try:
+                new_input = host_from_gpu(gpu_from_host(input))
+                fgraph.replace_validate(input, new_input,
+                                        "InputToGpuOptimizer")
+            except TypeError:
+                # This could fail if the inputs are not TensorTypes
+                pass
+
+gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
+                    0, 'fast_run', 'fast_compile', 'merge')
+
+@local_optimizer([])
+def local_cut_gpu_host_gpu(node):
+    """Remove a transfer that directly undoes the previous one.
+
+    For gpu_from_host(host_from_gpu(x)) or host_from_gpu(gpu_from_host(x))
+    return [x]; otherwise return False.
+    """
+    round_trips = ((gpu_from_host, host_from_gpu),
+                   (host_from_gpu, gpu_from_host))
+    for outer, inner in round_trips:
+        if tensor.opt.opt.check_chain(node, outer, inner):
+            return [node.inputs[0].owner.inputs[0]]
+    return False
+gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
+                        'fast_run', 'inplace', 'gpuarray')
+gpu_cut_copies.register('cut_gpua_constant_transfers',
+                        tensor.opt.constant_folding,
+                        'fast_run', 'gpuarray')
+optdb['canonicalize'].register('local_cut_gpua_host_gpua',
+                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
+
+@register_opt()
+@local_optimizer([tensor.Alloc])
+def local_gpualloc(node):
+    """Replace a host `tensor.alloc` by `gpu_alloc` when it pays off.
+
+    The replacement happens when the filled value comes from
+    `host_from_gpu`, when every client is a `gpu_from_host`, or when
+    every client is a `tensor.join` whose joined inputs are all produced
+    by `host_from_gpu` or `tensor.alloc`.
+
+    Returns a one-element list holding the replacement (transferred back
+    to the host), or None when the node is left alone.
+    """
+    replace = False
+    if node.op == tensor.alloc:
+        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
+            replace = True
+        elif all([c != 'output' and c.op == gpu_from_host
+                  for c, idx in node.outputs[0].clients]):
+            replace = True
+        elif all([c != 'output' and c.op == tensor.join and
+                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
+                       for i in c.inputs[1:]])
+                  for c, idx in node.outputs[0].clients]):
+            replace = True
+    if replace:
+        val = node.inputs[0]
+        shp = node.inputs[1:]
+        old_out = node.outputs[0]
+        # NOTE(review): val2 is computed but never used below.
+        val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
+        new_out = host_from_gpu(gpu_alloc(val, *shp))
+        if new_out.type != old_out.type:
+            # Only the broadcast pattern may differ, and the new output
+            # may only be more broadcastable than the old one.
+            assert new_out.type.ndim == old_out.type.ndim
+            assert new_out.type.dtype == old_out.type.dtype
+            for b_old, b_new in zip(old_out.type.broadcastable,
+                                    new_out.type.broadcastable):
+                assert b_new or (not b_old)
+            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
+
+        return [new_out]
+
+@register_opt()
+@local_optimizer([])
+def local_gpu_elemwise(node):
+    # Replace a host Elemwise by a GpuElemwise when its result is moved to
+    # the device or when one of its inputs comes from the device.
+    # Returns a one-element list with the replacement, or False.
+    do_replace = False
+    gpu_out = False
+    # check for gpu_from_host(Elemwise)) and extract the Elemwise node
+    if node.op == gpu_from_host:
+        host_i, = node.inputs
+        if (host_i.owner and
+            isinstance(host_i.owner.op, tensor.Elemwise) and
+            len(host_i.clients) == 1):
+            # From here on `node` is the Elemwise node, and the result
+            # must stay on the device (gpu_out).
+            node = host_i.owner
+            do_replace = True
+            gpu_out = True
+    # check for elemwise(..., host_from_gpu, ...)
+    if isinstance(node.op, tensor.Elemwise):
+        if numpy.any([i.owner and
+                      i.owner.op == host_from_gpu
+                      for i in node.inputs]):
+                do_replace = True
+    # Never replace when every input is reported as a scalar.
+    if numpy.all([_is_scalar(i)
+                  for i in node.inputs]):
+            do_replace = False
+
+    if do_replace:
+        new_op = GpuElemwise(node.op.scalar_op)
+        gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
+        if gpu_out:
+            return [gpu_elemwise]
+        else:
+            # The original node produced a host value: transfer back.
+            return [host_from_gpu(gpu_elemwise)]
+    else:
+        return False
--- a/theano/sandbox/gpuarray/tests/__init__.py
+++ b/theano/sandbox/gpuarray/tests/__init__.py
--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
+import unittest
+from itertools import izip
+from copy import copy, deepcopy
+
+import numpy
+import theano
+import theano.tensor as T
+from theano.compile import DeepCopyOp
+from theano.tensor.tests.test_basic import safe_make_node
+from theano.tests.unittest_tools import SkipTest
+from numpy.testing.noseclasses import KnownFailureTest
+
+import theano.sandbox.gpuarray
+
+import theano.sandbox.cuda as cuda_ndarray
+# When the cuda backend is available and pygpu has not been activated
+# yet, give the cuda backend a device if it has none, then initialise
+# the gpuarray backend on 'cuda'.
+if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
+    if not cuda_ndarray.use.device_number:
+        cuda_ndarray.use('gpu')
+    theano.sandbox.gpuarray.init_dev('cuda')
+
+# Skip the whole module when pygpu is still not activated at this point.
+if not theano.sandbox.gpuarray.pygpu_activated:
+    raise SkipTest("pygpu disabled")
+
+from theano.sandbox.gpuarray.type import (GpuArrayType,
+                                          gpuarray_shared_constructor)
+from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
+                                               gpu_alloc, gpu_from_cuda,
+                                               cuda_from_gpu)
+
+from theano.tests import unittest_tools as utt
+# Module-level random generator, seeded through the unittest tools.
+utt.seed_rng()
+rng = numpy.random.RandomState(seed=utt.fetch_seed())
+
+from pygpu import gpuarray
+
+# Compilation modes including / excluding the 'gpuarray' optimizations.
+# When the configured mode is FAST_COMPILE, FAST_RUN is used as the base
+# mode instead of the default mode.
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray')
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray'\
+)
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray')
+    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
+
+
+def may_fail(msg, EClass):
+    """Decorator factory for tests expected to fail in some setups.
+
+    The decorated function is called without arguments.  An exception
+    that is an instance of `EClass` is replaced by
+    ``KnownFailureTest(msg, exc)``; any other exception propagates
+    unchanged.
+    """
+    def test_decorator(f):
+        def wrapper():
+            try:
+                f()
+            except Exception, exc:
+                if not isinstance(exc, EClass):
+                    raise
+                raise KnownFailureTest(msg, exc)
+        # Keep the original name on the wrapper.
+        wrapper.__name__ = f.__name__
+        return wrapper
+    return test_decorator
+
+def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
+                 on_unused_input='raise', name=None):
+    """Compile a theano function with ``accept_inplace=True``.
+
+    Thin wrapper around ``theano.function``; `mode` falls back to the
+    module-level ``mode_with_gpu`` when it is None.  The remaining
+    arguments are forwarded unchanged.
+    """
+    compile_kwargs = dict(mode=mode,
+                          allow_input_downcast=allow_input_downcast,
+                          accept_inplace=True,
+                          on_unused_input=on_unused_input,
+                          name=name)
+    if mode is None:
+        compile_kwargs['mode'] = mode_with_gpu
+    return theano.function(inputs, outputs, **compile_kwargs)
+
+
+def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
+    """Build a shared variable with the first constructor accepting `value`.
+
+    Tries, in order, ``gpuarray_shared_constructor``, ``tensor_constructor``
+    and ``scalar_constructor``; a constructor raising TypeError is skipped.
+    Returns None when all of them raise TypeError.
+    """
+    from theano.tensor.sharedvar import tensor_constructor, scalar_constructor
+    candidates = (gpuarray_shared_constructor, tensor_constructor,
+                  scalar_constructor)
+    for make_shared in candidates:
+        try:
+            shared_var = make_shared(value, name=name, strict=strict,
+                                     allow_downcast=allow_downcast, **kwargs)
+        except TypeError:
+            continue
+        return shared_var
+    return None
+
+def rand_gpuarray(*shape, **kwargs):
+    """Return a GpuArray of shape `shape` with random values in [-1, 1).
+
+    Values are drawn from the module-level ``rng``.  The only keyword
+    accepted is ``dtype`` (defaults to ``theano.config.floatX``).
+
+    :raises TypeError: if any keyword other than ``dtype`` is given.
+    """
+    r = rng.rand(*shape) * 2 - 1
+    dtype = kwargs.pop('dtype', theano.config.floatX)
+    if kwargs:
+        # Interpolate the offending name into the message; passing it as
+        # a second exception argument leaves the '%s' unformatted.
+        raise TypeError('Unexpected argument %s' % (list(kwargs)[0],))
+    return gpuarray.array(r, dtype=dtype)
+
+
+def makeTester(name, op, expected, good=None, bad_build=None, checks=None,
+               bad_runtime=None, mode=None, skip=False, eps=1e-10):
+    """Build a ``unittest.TestCase`` subclass exercising `op`.
+
+    :param name: ``__name__`` given to the generated class.
+    :param op: callable handed to ``safe_make_node`` to build the node.
+    :param expected: callable computing the expected outputs from the
+        raw inputs, or a dict mapping a test name to expected outputs.
+    :param good: dict of test name -> inputs that must build, run and
+        match `expected`.
+    :param bad_build: dict of test name -> inputs for which building
+        the node must raise.
+    :param checks: dict of description -> ``check(inputs, outputs)``
+        predicate evaluated after each good case.
+    :param bad_runtime: dict of test name -> inputs that must build and
+        compile but raise when the compiled function is called.
+    :param mode: compilation mode forwarded to ``inplace_func``.
+    :param skip: when true, every test raises ``SkipTest(skip)``.
+    :param eps: not referenced by the generated tests.
+
+    Inputs are wrapped with ``fake_shared``, so the compiled functions
+    take no explicit arguments.
+    """
+    if good is None:
+        good = {}
+    if bad_build is None:
+        bad_build = {}
+    if bad_runtime is None:
+        bad_runtime = {}
+    if checks is None:
+        checks = {}
+
+    # Aliases: a name assigned in the class body below cannot read the
+    # enclosing function variable of the same name.
+    _op = op
+    _expected = expected
+    _good = good
+    _bad_build = bad_build
+    _bad_runtime = bad_runtime
+    _skip = skip
+    _checks = checks
+
+    class Checker(unittest.TestCase):
+        op = staticmethod(_op)
+        expected = staticmethod(_expected)
+        good = _good
+        bad_build = _bad_build
+        bad_runtime = _bad_runtime
+        skip = _skip
+        checks = _checks
+
+        def setUp(self):
+            # Evaluates the dotted path of this class and raises if it
+            # cannot be resolved -- presumably a guard that the tester
+            # is bound to a module-level name equal to `name`.
+            eval(self.__class__.__module__ + '.' + self.__class__.__name__)
+
+        def test_good(self):
+            if self.skip:
+                raise SkipTest(self.skip)
+
+            for testname, inputs in self.good.items():
+                inputs = [copy(inp) for inp in inputs]
+                inputrs = [fake_shared(inp) for inp in inputs]
+
+                try:
+                    node = safe_make_node(self.op, *inputrs)
+                except Exception, exc:
+                    err_msg = ("Test %s::%s: Error occured while making "
+                               "a node with inputs %s") % (self.op, testname,
+                                                           inputs)
+                    exc.args += (err_msg,)
+                    raise
+
+                try:
+                    f = inplace_func([], node.outputs, mode=mode,
+                                     name='test_good')
+                except Exception, exc:
+                    err_msg = ("Test %s::%s: Error occured while trying to "
+                               "make a Function") % (self.op, testname)
+                    exc.args += (err_msg,)
+                    raise
+
+                if isinstance(self.expected, dict) and \
+                        testname in self.expected:
+                    expecteds = self.expected[testname]
+                else:
+                    expecteds = self.expected(*inputs)
+
+                if not isinstance(expecteds, (list, tuple)):
+                    expecteds = (expecteds,)
+
+                try:
+                    variables = f()
+                except Exception, exc:
+                    err_msg = ("Test %s::%s: Error occured while calling "
+                               "the Function on the inputs %s") % (self.op,
+                                                                   testname,
+                                                                   inputs)
+                    exc.args += (err_msg,)
+                    raise
+
+                for i, (variable, expected) in \
+                        enumerate(izip(variables, expecteds)):
+                    if variable.dtype != expected.dtype or \
+                            variable.shape != expected.shape or \
+                            not GpuArrayType.values_eq_approx(variable,
+                                                             expected):
+                        self.fail(("Test %s::%s: Output %s gave the wrong "
+                                   "value. With inputs %s, expected %s "
+                                   "(dtype %s), got %s (dtype %s).") % (
+                                self.op, testname, i, inputs, expected,
+                                expected.dtype, variable, variable.dtype))
+
+                for description, check in self.checks.items():
+                    if not check(inputs, variables):
+                        self.fail(("Test %s::%s: Failed check: %s "
+                                   "(inputs were %s, ouputs were %s)") %
+                                  (self.op, testname, description,
+                                   inputs, variables))
+
+        def test_bad_build(self):
+            if self.skip:
+                raise SkipTest(self.skip)
+            for testname, inputs in self.bad_build.items():
+                inputs = [copy(inp) for inp in inputs]
+                inputrs = [fake_shared(inp) for inp in inputs]
+                self.assertRaises(Exception, safe_make_node, self.op, *inputrs)
+
+        def test_bad_runtime(self):
+            if self.skip:
+                raise SkipTest(self.skip)
+            for testname, inputs in self.bad_runtime.items():
+                inputrs = [fake_shared(inp) for inp in inputs]
+                try:
+                    node = safe_make_node(self.op, *inputrs)
+                except Exception, exc:
+                    err_msg = ("Test %s::%s: Error occured while trying to "
+                               "make a node with inputs %s") % (self.op,
+                                                                testname,
+                                                                inputs)
+                    exc.args += (err_msg,)
+                    raise
+
+                try:
+                    f = inplace_func([], node.outputs, mode=mode,
+                                     name="test_bad_runtime")
+                except Exception, exc:
+                    err_msg = ("Test %s::%s: Error occured while trying to "
+                               "make a Function") % (self.op, testname)
+                    exc.args += (err_msg,)
+                    raise
+
+                # The function was compiled with no inputs, so it must be
+                # called without arguments: passing a spurious argument
+                # would make the call itself fail and the assertion pass
+                # whatever the op does.
+                self.assertRaises(Exception, f)
+
+    Checker.__name__ = name
+    return Checker
+
+
+def test_transfer_cpu_gpu():
+    # Transfer a float32 matrix host -> gpu and gpu -> host and compare
+    # each result with the value built directly on the other side.
+    host_sym = T.fmatrix('a')
+    gpu_sym = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
+
+    host_val = numpy.asarray(rng.rand(5, 4), dtype='float32')
+    gpu_val = gpuarray.array(host_val)
+
+    to_gpu = theano.function([host_sym], gpu_from_host(host_sym))
+    on_gpu = to_gpu(host_val)
+    assert GpuArrayType.values_eq(on_gpu, gpu_val)
+
+    to_host = theano.function([gpu_sym], host_from_gpu(gpu_sym))
+    on_host = to_host(gpu_val)
+    assert numpy.all(on_host == host_val)
+
+
+def test_transfer_strided():
+    # Only checks that strided transfers work from within theano;
+    # compyte has a much more comprehensive suite of correctness tests.
+    host_sym = T.fmatrix('a')
+    gpu_sym = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
+
+    full_host = numpy.asarray(rng.rand(5, 8), dtype='float32')
+    full_gpu = gpuarray.array(full_host)
+
+    # Keep every other column on both sides.
+    host_val = full_host[:,::2]
+    gpu_val = full_gpu[:,::2]
+
+    to_gpu = theano.function([host_sym], gpu_from_host(host_sym))
+    on_gpu = to_gpu(host_val)
+    assert GpuArrayType.values_eq(on_gpu, gpu_val)
+
+    to_host = theano.function([gpu_sym], host_from_gpu(gpu_sym))
+    on_host = to_host(gpu_val)
+    assert numpy.all(on_host == host_val)
+
+
+@may_fail("Op fails if both contexts are not the same and it's rare "
+          "that the tests will be run this way", ValueError)
+def test_transfer_cuda_gpu():
+    # Transfers between CudaNdarray and GpuArray values, for the full
+    # arrays and for views with a negative step on the last axis.
+    import theano.sandbox.cuda as cuda_ndarray
+    if cuda_ndarray.cuda_available == False:
+        raise SkipTest("Can't test interaction with cuda if cuda not present")
+    gpu_sym = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
+    cuda_sym = cuda_ndarray.CudaNdarrayType((False, False))('c')
+
+    host_val = theano._asarray(rng.rand(5, 4), dtype='float32')
+    gpu_val = gpuarray.array(host_val)
+    cuda_val = cuda_ndarray.CudaNdarray(host_val)
+    gpu_view = gpu_val[:,::-2]
+    cuda_view = cuda_val[:,::-2]
+
+    # cuda -> gpuarray
+    to_gpuarray = theano.function([cuda_sym], gpu_from_cuda(cuda_sym))
+    res = to_gpuarray(cuda_val)
+    assert GpuArrayType.values_eq_approx(res, gpu_val)
+
+    res_view = to_gpuarray(cuda_view)
+    assert GpuArrayType.values_eq_approx(res_view, gpu_view)
+
+    # gpuarray -> cuda
+    to_cuda = theano.function([gpu_sym], cuda_from_gpu(gpu_sym))
+    res = to_cuda(gpu_val)
+    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(res, cuda_val)
+
+    res_view = to_cuda(gpu_view)
+    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(res_view, cuda_view)
+
+
+def gpu_alloc_expected(x, *shp):
+    """Reference result for gpu_alloc.
+
+    Creates an empty GpuArray of shape `shp` and dtype ``x.dtype``, then
+    assigns `x` to the whole array.
+    """
+    filled = gpuarray.empty(shp, dtype=x.dtype)
+    filled[:] = x
+    return filled
+
+# Generated TestCase for gpu_alloc.  Every entry is (value, *shape):
+# the `good` cases are compared against gpu_alloc_expected, the
+# `bad_runtime` case must build and compile but raise when executed.
+GpuAllocTester = makeTester(
+    name="GpuAllocTester",
+    op=gpu_alloc,
+    expected=gpu_alloc_expected,
+    good=dict(
+        correct01=(rand_gpuarray(), numpy.int32(7)),
+        correct01_bcast=(rand_gpuarray(1), numpy.int32(7)),
+        correct02=(rand_gpuarray(), numpy.int32(4), numpy.int32(7)),
+        correct12=(rand_gpuarray(7), numpy.int32(4), numpy.int32(7)),
+        correct13=(rand_gpuarray(7), numpy.int32(2), numpy.int32(4),
+                   numpy.int32(7)),
+        correct23=(rand_gpuarray(4, 7), numpy.int32(2), numpy.int32(4),
+                   numpy.int32(7))
+        ),
+    bad_runtime=dict(
+        bad_shape12=(rand_gpuarray(7), numpy.int32(7), numpy.int32(5)),
+        )
+)
+
+def test_deep_copy():
+    # An identity function over a GpuArray input must compile to a
+    # DeepCopyOp and return a value equal to its input.
+    value = rand_gpuarray(20, dtype='float32')
+    sym = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
+
+    identity = theano.function([sym], sym)
+
+    first_node = identity.maker.fgraph.toposort()[0]
+    assert isinstance(first_node.op, DeepCopyOp)
+
+    copied = identity(value)
+
+    assert GpuArrayType.values_eq(copied, value)
--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
+import numpy
+
+import theano
+from theano import Type, Variable, Constant, tensor, config, scalar
+from theano.compile import SharedVariable
+
+# Make sure this is importable even if pygpu is absent
+# (it will not work though)
+try:
+    import pygpu
+    from pygpu import gpuarray
+    from pygpu.elemwise import compare, elemwise2
+except ImportError:
+    pass
+
+
+class GpuArrayType(Type):
+    """Theano type for arrays stored in a pygpu ``GpuArray``.
+
+    Two instances compare equal when they have the same class, the same
+    pygpu typecode and the same broadcastable pattern.
+    """
+    def __init__(self, dtype, broadcastable, name=None):
+        """
+        :param dtype: element dtype; stored as ``str(dtype)``.
+        :param broadcastable: iterable of truth values, one per dimension.
+        :param name: optional name stored on the type.
+        :raises TypeError: if pygpu has no typecode for `dtype`.
+        """
+        self.dtype = str(dtype)
+        self.broadcastable = tuple(bool(b) for b in broadcastable)
+        self.ndim = len(self.broadcastable)
+        self.name = name
+        try:
+            self.typecode = gpuarray.dtype_to_typecode(self.dtype)
+        except gpuarray.GpuArrayException:
+            raise TypeError("Unsupported dtype for %s: %s" %
+                            (self.__class__.__name__, self.dtype))
+
+    def filter(self, data, strict=False, allow_downcast=None):
+        """Check or convert `data` so that it is a valid value of this type.
+
+        * ``strict``: `data` must already be a GpuArray with this type's
+          typecode; nothing is converted.
+        * ``allow_downcast``: `data` is converted to this type's typecode
+          unconditionally.
+        * otherwise: `data` is converted only when upcasting its dtype
+          with this type's dtype yields this type's dtype (`data` must
+          expose a ``dtype`` attribute on this path).
+
+        In every case the number of dimensions and the broadcastable
+        pattern are then checked.
+
+        :raises TypeError: on any mismatch described above.
+        """
+        if strict:
+            if not isinstance(data, gpuarray.GpuArray):
+                raise TypeError("%s expected a GpuArray object." % self,
+                                data, type(data))
+            if self.typecode != data.typecode:
+                raise TypeError("%s expected typecode %d (dtype %s), "
+                                "got %d (dtype %s)." %
+                                (self, self.typecode, self.dtype,
+                                 data.typecode, str(data.dtype)))
+            # fallthrough to ndim check
+        elif allow_downcast:
+            data = gpuarray.array(data, dtype=self.typecode, copy=False,
+                                  ndmin=len(self.broadcastable))
+        else:
+            up_dtype = scalar.upcast(self.dtype, data.dtype)
+            if up_dtype == self.dtype:
+                data = gpuarray.array(data, dtype=self.typecode, copy=False)
+            else:
+                raise TypeError("%s cannot store a value of dtype %s "
+                                "without risking loss of precision." %
+                                (self, data.dtype))
+
+        if self.ndim != data.ndim:
+            raise TypeError("Wrong number of dimensions: expected %s, "
+                            "got %s with shape %s." % (self.ndim, data.ndim,
+                                                       data.shape), data)
+        shp = data.shape
+        for i, b in enumerate(self.broadcastable):
+            if b and shp[i] != 1:
+                raise TypeError("Non-unit value on shape on a broadcastable"
+                                " dimension.", shp, self.broadcastable)
+        return data
+
+    def filter_variable(self, other):
+        """Convert the symbolic `other` into a variable of this type.
+
+        Objects exposing ``_as_GpuArrayVariable`` are converted through
+        it and non-Variables are wrapped in a Constant of this type.  A
+        variable already of this type is returned as is; a TensorType
+        variable with matching dtype, ndim and broadcastable pattern is
+        wrapped in ``gpu_from_host``.
+
+        :raises TypeError: if `other` has an incompatible type.
+        """
+        if hasattr(other, '_as_GpuArrayVariable'):
+            other = other._as_GpuArrayVariable()
+
+        if not isinstance(other, Variable):
+            other = self.Constant(type=self, data=other)
+
+        if other.type == self:
+            return other
+
+        if not isinstance(other.type, tensor.TensorType):
+            raise TypeError('Incompatible type', (self, other.type))
+        if (other.type.dtype != self.dtype):
+            raise TypeError('Incompatible dtype', (self.dtype,
+                                                   other.type.dtype))
+        if other.type.ndim != self.ndim:
+            raise TypeError('Incompatible number of dimensions.'
+                            ' Expected %d, got %d.' % (self.ndim,
+                                                       other.type.ndim))
+        if other.type.broadcastable != self.broadcastable:
+            # "Expected" is this type's pattern, "got" is the incoming one.
+            raise TypeError('Incompatible broadcastable dimensions.'
+                            ' Expected %s, got %s.' %
+                            (str(self.broadcastable),
+                             str(other.type.broadcastable)))
+
+        return theano.sandbox.gpuarray.basic_ops.gpu_from_host(other)
+
+    @staticmethod
+    def values_eq(a, b):
+        """Return True if `a` and `b` have the same shape, the same
+        typecode and are equal element-wise."""
+        if a.shape != b.shape:
+            return False
+        if a.typecode != b.typecode:
+            return False
+        return numpy.asarray(compare(a, '==', b)).all()
+
+    @staticmethod
+    def values_eq_approx(a, b):
+        """Return True if `a` and `b` are approximately equal.
+
+        Shapes and dtypes must match.  Dtypes whose name contains 'int'
+        are compared exactly; the others must satisfy, element-wise,
+        ``fabs(a - b) < 1e-8 + 1e-5 * fabs(b)``.
+        """
+        if a.shape != b.shape or a.dtype != b.dtype:
+            return False
+        if 'int' in str(a.dtype):
+            return GpuArrayType.values_eq(a, b)
+        else:
+            # The absolute difference is needed: with the signed
+            # difference any `a` far below `b` would compare as close.
+            res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
+                            op_tmpl="res[i] = (fabs(%(a)s - %(b)s) <"
+                                "(1e-8 + 1e-5 * fabs(%(b)s)))")
+            return numpy.asarray(res).all()
+
+    def value_zeros(self, shape):
+        """Return a zero-filled GpuArray of `shape` with this typecode."""
+        return pygpu.gpuarray.zeros(shape, dtype=self.typecode)
+
+    def make_variable(self, name=None):
+        """Return a new ``self.Variable`` of this type named `name`."""
+        return self.Variable(self, name=name)
+
+    def __eq__(self, other):
+        return (type(self) == type(other) and
+                self.typecode == other.typecode and
+                self.broadcastable == other.broadcastable)
+
+    def __hash__(self):
+        # Built from the same two attributes that __eq__ compares.
+        return (hash(self.typecode) ^ hash(self.broadcastable))
+
+    def __str__(self):
+        return "GpuArray<%s>" % (self.dtype,)
+
+    def get_shape_info(self, obj):
+        """Return the shape of `obj`."""
+        return obj.shape
+
+    def get_size(self, shape_info):
+        """Return the size in bytes of a value with shape `shape_info`.
+
+        A false `shape_info` (e.g. an empty tuple) counts as one element.
+        """
+        if shape_info:
+            return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
+        else:
+            return numpy.dtype(self.dtype).itemsize
+
+    def c_declare(self, name, sub):
+        return "GpuArrayObject *%s;" % (name,)
+
+    def c_init(self, name, sub):
+        return "%s = NULL;" % (name,)
+
+    def c_extract(self, name, sub):
+        # Rejects None and objects that are not GpuArray instances, then
+        # stores a new reference to the Python object in the C variable.
+        # TODO I don't check broadcast stuff for now.
+        return """
+        %(name)s = NULL;
+        if (py_%(name)s == Py_None) {
+            PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
+            %(fail)s
+        }
+        /* First check if we are the base type exactly (the most common case),
+           then do the full subclass check if needed. */
+        if (py_%(name)s->ob_type != &GpuArrayType &&
+            !PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) {
+            PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
+            %(fail)s
+        }
+        %(name)s = (GpuArrayObject *)py_%(name)s;
+        Py_INCREF(%(name)s);
+        """ % {'name': name, 'fail': sub['fail']}
+
+    def c_cleanup(self, name, sub):
+        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name }
+
+    def c_sync(self, name, sub):
+        # Makes the Python-side storage point at the C variable: None
+        # when the C pointer is NULL, otherwise the C object itself when
+        # it differs from what is currently stored.
+        return """
+        if (!%(name)s) {
+            Py_XDECREF(py_%(name)s);
+            Py_INCREF(Py_None);
+            py_%(name)s = Py_None;
+        } else if ((void *)py_%(name)s != (void *)%(name)s) {
+            Py_XDECREF(py_%(name)s);
+            py_%(name)s = (PyObject *)%(name)s;
+            Py_INCREF(py_%(name)s);
+        }
+        """ % {'name': name}
+
+    def c_init_code(self):
+        # We don't actually need the numpy API except in
+        # HostFromGpu and GpuFromHost and those case will be covered
+        # by the TensorType parameter
+        return ['import_pygpu__gpuarray();']
+
+    def c_headers(self):
+        # We need arrayobject for the PyArrayDescr struct def
+        # (even if we just use a pointer to it in a function def)
+        return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>',
+                '<numpy/arrayobject.h>', '<gpuarray_api.h>']
+
+    def c_header_dirs(self):
+        return [pygpu.get_include(), numpy.get_include()]
+
+    def c_libraries(self):
+        return ['compyte']
+
+    def c_code_cache_version(self):
+        return (1,)
+
+
+class _operators(tensor.basic._tensor_py_operators):
+    """Operator mixin shared by the GpuArray variable classes.
+
+    Adds the conversion hooks ``_as_TensorVariable`` and
+    ``_as_GpuArrayVariable`` and exposes ``dtype``, ``broadcastable``
+    and ``ndim`` from the variable's type.
+    """
+    def _as_TensorVariable(self):
+        # Absolute import: the bare ``from basic_ops import ...`` form
+        # only resolves through Python 2 implicit relative imports.
+        # It stays function-local -- presumably to avoid a circular
+        # import between type.py and basic_ops.py.
+        from theano.sandbox.gpuarray.basic_ops import host_from_gpu
+        return host_from_gpu(self)
+
+    def _as_GpuArrayVariable(self):
+        return self
+
+    dtype = property(lambda s: s.type.dtype)
+    broadcastable = property(lambda s: s.type.broadcastable)
+    ndim = property(lambda s: s.type.ndim)
+
+
+class GpuArrayVariable(_operators, Variable):
+    """Variable class combining ``_operators`` with ``Variable``."""
+    pass
+
+
+# Registered as the Variable class of GpuArrayType (used by
+# GpuArrayType.make_variable).
+GpuArrayType.Variable = GpuArrayVariable
+
+
+class GpuArraySignature(tensor.basic.TensorConstantSignature):
+    """Signature class for GpuArray constants; it adds nothing to
+    ``TensorConstantSignature``."""
+    pass  # might do something better if we can run the sum on the
+          # GPU, but for now this will suffice.
+
+
+class GpuArrayConstant(_operators, Constant):
+    """Constant whose data is converted with ``numpy.asarray`` for its
+    signature and its string form."""
+    def signature(self):
+        host_data = numpy.asarray(self.data)
+        return GpuArraySignature((self.type, host_data))
+
+    def __str__(self):
+        # An explicit name wins over the data-based representation.
+        if self.name is None:
+            return "GpuArrayConstant{%s}" % numpy.asarray(self.data)
+        return self.name
+
+
+GpuArrayType.Constant = GpuArrayConstant
+
+
+class GpuArraySharedVariable(_operators, SharedVariable):
+    """Shared variable whose container holds a GpuArray."""
+    def get_value(self, borrow=False, return_internal_type=False):
+        """Return the stored value.
+
+        Without `return_internal_type` the value is converted with
+        ``numpy.asarray``.  With it, the stored object itself is
+        returned when `borrow` is true and a copy of it otherwise.
+        """
+        stored = self.container.value
+        if not return_internal_type:
+            return numpy.asarray(stored)
+        if borrow:
+            return stored
+        return stored.copy()
+
+    def set_value(self, value, borrow=False):
+        """Store `value` as a GpuArray, copying it unless `borrow`."""
+        new_value = pygpu.gpuarray.array(value, copy=(not borrow))
+        self.container.value = new_value
+
+    def __getitem__(self, *args):
+        # Explicitly dispatch to the _operators implementation.
+        return _operators.__getitem__(self, *args)
+
+
+GpuArrayType.SharedVariable = GpuArraySharedVariable
+
+
+def gpuarray_shared_constructor(value, name=None, strict=False,
+                                allow_downcast=None, borrow=False,
+                                broadcastable=None):
+    """SharedVariable constructor for GpuArrayType.
+
+    `value` must be a numpy.ndarray or a GpuArray, otherwise TypeError
+    is raised.  `broadcastable` defaults to all-False over
+    ``value.ndim`` dimensions.  The value is copied to a GpuArray unless
+    `borrow` is true.  `allow_downcast` is accepted but not used here.
+    """
+    accepted = (numpy.ndarray, pygpu.gpuarray.GpuArray)
+    if not isinstance(value, accepted):
+        raise TypeError('ndarray or GpuArray required')
+
+    pattern = broadcastable
+    if pattern is None:
+        pattern = (False,) * value.ndim
+    gpu_type = GpuArrayType(value.dtype, pattern)
+    device_value = pygpu.gpuarray.array(value, copy=(not borrow))
+    return GpuArraySharedVariable(type=gpu_type, value=device_value,
+                                  name=name, strict=strict)
+
+# C implementation of the view op for GpuArrayType: the output takes a
+# new reference to the input object after releasing whatever it held.
+theano.compile.register_view_op_c_code(GpuArrayType, """
+    Py_XDECREF(%(oname)s);
+    %(oname)s = %(iname)s;
+    Py_XINCREF(%(oname)s);
+""", version=(0,))
+
+# C implementation of the deep copy op for GpuArrayType: release the
+# previous output, create a new GpuArray object on the default context
+# and copy the input into it with GpuArray_copy.  A failed copy sets a
+# RuntimeError before running the failure code.
+theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
+    Py_XDECREF(%(oname)s);
+    %(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
+    if (!%(oname)s) { %(fail)s }
+    int err;
+    err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
+    if (err != GA_NO_ERROR) {
+        PyErr_SetString(PyExc_RuntimeError, "Error during copy");
+        %(fail)s
+    }
+""", version=(1,))