提交 602e87d8 authored 作者: James Bergstra's avatar James Bergstra

added files that i forgot to add before

上级 fbbeb192
# Build the CUDA type-support Python extension module with nvcc.
# Requires CUDA_ROOT in the environment and links against the prebuilt
# cuda_ndarray.so from $(HOME)/cvs/lgcm/cuda_ndarray.
# NOTE(review): hard-codes python2.6 headers and the developer's home-dir
# checkout path -- not portable across machines.
type_support.so : type_support.cu
	nvcc -O3 -shared -I$(HOME)/cvs/lgcm/cuda_ndarray -I$(CUDA_ROOT)/include -I/usr/include/python2.6 -o type_support.so -Xcompiler -fPIC type_support.cu -L$(CUDA_ROOT)/lib $(HOME)/cvs/lgcm/cuda_ndarray/cuda_ndarray.so
# Remove the built shared library.
clean :
	rm type_support.so
import sys, os, subprocess, logging
from theano.gof.cmodule import (std_libs, std_lib_dirs, std_include_dirs, dlimport,
get_lib_extension)
# Module-level logger; default threshold WARN, so info()/debug() are silent
# unless a caller lowers the level.
_logger = logging.getLogger("theano_cuda_ndarray.nvcc_compiler")
_logger.setLevel(logging.WARN)


def _tagged(tag, args):
    """Return *tag* followed by each element of *args* rendered via str()."""
    return tag + ' '.join(str(a) for a in args)


def error(*args):
    """Log *args*, space-joined, through the module logger at ERROR level."""
    _logger.error(_tagged("ERROR: ", args))


def warning(*args):
    """Log *args*, space-joined, through the module logger at WARNING level."""
    _logger.warning(_tagged("WARNING: ", args))


def info(*args):
    """Log *args*, space-joined, through the module logger at INFO level."""
    _logger.info(_tagged("INFO: ", args))


def debug(*args):
    """Log *args*, space-joined, through the module logger at DEBUG level."""
    _logger.debug(_tagged("DEBUG: ", args))
def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[], lib_dirs=[], libs=[],
        preargs=[]):
    """Compile CUDA module source with nvcc and import the resulting .so.

    :param module_name: string (this has been embedded in the src_code
    :param src_code: a complete c or c++ source listing for the module
    :param location: a pre-existing filesystem directory where the cpp file and .so will be written
    :param include_dirs: a list of include directory names (each gets prefixed with -I)
    :param lib_dirs: a list of library search path directory names (each gets prefixed with -L)
    :param libs: a list of libraries to link with (each gets prefixed with -l)
    :param preargs: a list of extra compiler arguments

    :returns: dynamically-imported python module of the compiled code.

    :raises Exception: when nvcc exits with a nonzero status (after printing
        the filtered compiler output and the numbered source listing).
    """
    # NOTE(review): Python 2 code -- uses the `file()` builtin and `print`
    # statements; will not run unmodified under Python 3.
    # Copy preargs so the (mutable) default list is never modified in place.
    preargs= [] if preargs is None else list(preargs)
    preargs.append('-fPIC')
    no_opt = False  # NOTE(review): assigned but never read.
    # Prepend the theano-standard search paths; cudart is always linked.
    include_dirs = std_include_dirs() + include_dirs
    libs = std_libs() + ['cudart'] + libs
    # assumes CUDA_ROOT is set in the environment -- os.getenv returns None
    # otherwise and os.path.join would raise; TODO confirm callers guarantee it.
    lib_dirs = std_lib_dirs() + [os.path.join(os.getenv('CUDA_ROOT'), 'lib')] + lib_dirs
    # nvcc requires the .cu extension even though the listing is C/C++.
    cppfilename = os.path.join(location, 'mod.cu')
    cppfile = file(cppfilename, 'w')

    debug('Writing module C++ code to', cppfilename)
    ofiles = []  # NOTE(review): assigned but never read.
    rval = None  # NOTE(review): assigned but never read.

    cppfile.write(src_code)
    cppfile.close()

    lib_filename = os.path.join(location, '%s.%s' %
            (module_name, get_lib_extension()))

    debug('Generating shared lib', lib_filename)
    # -O flags must go to nvcc itself; every other prearg is forwarded to the
    # host compiler via a single comma-joined -Xcompiler argument.
    cmd = ['nvcc', '-shared', '-g'] + [pa for pa in preargs if pa.startswith('-O')]
    cmd.extend(['-Xcompiler', ','.join(pa for pa in preargs if not pa.startswith('-O'))])
    cmd.extend('-I%s'%idir for idir in include_dirs)
    cmd.extend(['-o',lib_filename])
    cmd.append(cppfilename)
    cmd.extend(['-L%s'%ldir for ldir in lib_dirs])
    cmd.extend(['-l%s'%l for l in libs])
    debug('Running cmd', ' '.join(cmd))
    # Only stderr is captured; nvcc's stdout passes through untouched.
    p = subprocess.Popen(cmd, stderr=subprocess.PIPE)
    stderr = p.communicate()[1]

    if p.returncode:
        # filter the output from the compiler
        for l in stderr.split('\n'):
            if not l:
                continue
            # filter out the annoying declaration warnings
            try:
                if l[l.index(':'):].startswith(': warning: variable'):
                    continue
                if l[l.index(':'):].startswith(': warning: label'):
                    continue
            except:
                # l.index(':') raises ValueError on lines without a colon;
                # such lines are printed as-is.
                pass
            print l
        print '==============================='
        # Echo the failing source with 1-based line numbers so compiler
        # messages can be matched against it.
        for i, l in enumerate(src_code.split('\n')):
            print i+1, l
        raise Exception('nvcc return status', p.returncode)

    #touch the __init__ file (makes `location` an importable package dir)
    file(os.path.join(location, "__init__.py"),'w').close()
    return dlimport(lib_filename)
from theano import tensor, gof
from theano import tensor, scalar
from .basic_ops import *
@gof.local_optimizer([GpuFromHost(), None])
def local_gpu_host_gpu(node):
    """Collapse GpuFromHost(HostFromGpu(x)) to x, dropping a GPU->host->GPU round trip."""
    if tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()):
        # x (the HostFromGpu input) already lives on the GPU; reuse it directly.
        return [node.inputs[0].owner.inputs[0]]
    return False
tensor.opt.register_specialize(local_gpu_host_gpu, 'gpu')
@gof.local_optimizer([HostFromGpu(), None])
def local_host_gpu_host(node):
    """Collapse HostFromGpu(GpuFromHost(x)) to x, dropping a host->GPU->host round trip."""
    if tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
        # x (the GpuFromHost input) is already a host variable; reuse it directly.
        return [node.inputs[0].owner.inputs[0]]
    return False
tensor.opt.register_specialize(local_host_gpu_host, 'gpu')
@gof.local_optimizer([])
def local_gpu_elemwise(node):
    """Move an Elemwise to the GPU when any of its inputs was copied off the GPU.

    Rewrites Elemwise(...) as HostFromGpu(GpuElemwise(GpuFromHost(...))) so the
    transfer-elimination optimizations can then cancel the redundant copies.
    """
    if not isinstance(node.op, tensor.Elemwise):
        return False

    def from_gpu(var):
        return hasattr(var.owner, 'op') and isinstance(var.owner.op, HostFromGpu)

    if not any(from_gpu(i) for i in node.inputs):
        return False
    gpu_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
    gpu_inputs = [gpu_from_host(i) for i in node.inputs]
    return [host_from_gpu(gpu_op(*gpu_inputs))]
tensor.opt.register_specialize(local_gpu_elemwise, 'gpu')
@gof.local_optimizer([])
def local_gpu_dimshuffle(node):
    """Move a DimShuffle to the GPU when its input was copied off the GPU.

    Non-inplace shuffles copy the input first (tensor_copy) to preserve the
    original op's no-aliasing behaviour.
    """
    if not isinstance(node.op, tensor.DimShuffle):
        return False
    inp, = node.inputs
    if inp.owner is None or not isinstance(inp.owner.op, HostFromGpu):
        return False
    gpu_shuffle = GpuDimShuffle(node.op.input_broadcastable,
            node.op.new_order)
    if node.op.inplace:
        shuffled = gpu_shuffle(gpu_from_host(inp))
    else:
        shuffled = gpu_shuffle(gpu_from_host(tensor.tensor_copy(inp)))
    return [host_from_gpu(shuffled)]
tensor.opt.register_specialize(local_gpu_dimshuffle, 'gpu')
import numpy
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor
from theano.compile.sandbox.sharedvalue import shared, SharedVariable, shared_constructor
from .type import CudaNdarrayType
from .type_support import filter as type_support_filter
from .basic_ops import HostFromGpu, GpuFromHost
class _operators(tensor.basic._tensor_py_operators):
    """Mixin giving CudaNdarray variables the standard tensor operator surface.

    Arithmetic on these variables builds graphs of TensorType variables; the
    specialization pass later swaps in pure GPU implementations.  This keeps
    the Cuda Ops free of input argument checking and gradients.
    """

    def _as_TensorVariable(self):
        # Interoperate with TensorType graphs by copying back to the host.
        return HostFromGpu()(self)

    def _as_CudaNdarrayVariable(self):
        return self

    @property
    def dtype(self):
        # CudaNdarray storage is always single precision.
        return 'float32'

    @property
    def broadcastable(self):
        return self.type.broadcastable

    @property
    def ndim(self):
        return self.type.ndim
class CudaNdarrayVariable(Variable, _operators):
    # Graph variable holding a CudaNdarrayType value; all operator behaviour
    # comes from the _operators mixin.
    pass
# Tell CudaNdarrayType to build variables of this class.
CudaNdarrayType.Variable = CudaNdarrayVariable
class CudaNdarrayConstant(Constant, _operators):
    # Graph constant holding a CudaNdarrayType value; all operator behaviour
    # comes from the _operators mixin.
    pass
# Tell CudaNdarrayType to build constants of this class.
CudaNdarrayType.Constant = CudaNdarrayConstant
class CudaNdarraySharedVariable(SharedVariable, _operators):
    """Shared variable whose storage is a CudaNdarray living on the GPU."""

    @property
    def value(self):
        # Reading copies the device array back to host memory as an ndarray.
        return numpy.asarray(self.container.value)

    @value.setter
    def value(self, new_value):
        # container does the filtering
        self.container.value = new_value

    def filter_update(self, other):
        """Coerce an update expression *other* to a CudaNdarray variable.

        Raises TypeError when *other* is neither GPU-convertible nor a
        matching float32 TensorType variable.
        """
        converter = getattr(other, '_as_CudaNdarrayVariable', None)
        if converter is not None:
            return converter()
        compatible = (isinstance(other.type, tensor.TensorType)
                and other.type.dtype == self.dtype
                and other.broadcastable == self.broadcastable)
        if compatible:
            return GpuFromHost()(other)
        raise TypeError((other, other.type))
CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def shared_constructor(value, name, strict=False):
    """SharedVariable constructor for CudaNdarrayType (float32 only).

    :param value: initial value; coerced to a float32 ndarray unless strict
    :param name: name for the shared variable
    :param strict: when True, *value* must already be a float32 ndarray
    :raises TypeError: when the (coerced) value is not a float32 ndarray

    NOTE(review): this definition shadows the `shared_constructor` imported
    from theano.compile.sandbox.sharedvalue at module level.
    """
    if strict:
        _value = value
    else:
        _value = numpy.asarray(value, dtype='float32')
    if not isinstance(_value, numpy.ndarray):
        raise TypeError('ndarray required')
    if _value.dtype.num != CudaNdarrayType.typenum:
        raise TypeError('float32 ndarray required')
    # BUGFIX: use the coerced `_value`, not `value` -- when strict is False
    # the caller may pass a plain list/scalar that has no .shape attribute.
    bcast = [False for b in _value.shape]
    type = CudaNdarrayType(broadcastable=bcast)
    return CudaNdarraySharedVariable(type=type, value=_value, name=name, strict=strict)
def unset_shared_for_numpy():
    """Undo set_shared_for_numpy.  Not yet implemented."""
    raise NotImplementedError()
def set_shared_for_numpy():
    """
    Set the gpu_tensor_constructor as the handler for ndarray
    """
    # BUGFIX: the original called `shared_constructor(shared_constructor)`,
    # but the registration function imported from
    # theano.compile.sandbox.sharedvalue is shadowed by this module's own
    # `shared_constructor` definition -- so it invoked the constructor on
    # itself instead of registering it.  Import the registration function
    # under an alias and register this module's constructor with it.
    from theano.compile.sandbox.sharedvalue import shared_constructor as _register
    _register(shared_constructor)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论