Commit 1da867d8 authored by Olivier Delalleau

Merge remote-tracking branch 'delallea/surban-master'

@@ -33,3 +33,4 @@ theano/version.py
 theano/version.py.out
 distribute-*.egg
 distribute-*.tar.gz
+Theano.suo

Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "Theano", "Theano.pyproj", "{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}.Release|Any CPU.ActiveCfg = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
EndGlobal
@@ -4,6 +4,7 @@
 import errno
 import os
 import platform
 import re
+import sys
 import theano
 from theano.configparser import config, AddConfigVar, ConfigParam, StrParam
@@ -14,7 +15,7 @@ def default_compiledirname():
             platform.platform(),
             platform.processor(),
             platform.python_version()])
-    platform_id = re.sub("[\(\)\s]+", "_", platform_id)
+    platform_id = re.sub("[\(\)\s,]+", "_", platform_id)
     return 'compiledir_' + platform_id
@@ -50,9 +51,19 @@ def filter_compiledir(path):
     return path

+# TODO Using the local user profile on Windows is currently disabled as it
+# is not documented yet, and may break some existing code. It will be enabled
+# in a future code update.
+if False and sys.platform == 'win32':
+    # On Windows we should not write temporary files to a directory
+    # that is part of the roaming part of the user profile. Instead
+    # we use the local part of the user profile.
+    basecompiledir = os.path.join(os.environ['LOCALAPPDATA'], 'theano')
+else:
+    basecompiledir = os.path.join(config.home, '.theano')
+
 AddConfigVar('base_compiledir',
         "arch-independent cache directory for compiled modules",
-        StrParam(os.path.join(config.home, '.theano'), allow_override=False))
+        StrParam(basecompiledir, allow_override=False))

 AddConfigVar('compiledir',
         "arch-dependent cache directory for compiled modules",
...
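Side note on the regex change above: the output of platform.platform() and platform.processor() on Windows typically contains commas as well as spaces and parentheses, all awkward in a directory name. A minimal sketch (the platform string is made up) of what the widened character class does:

import re

# hypothetical value; real ones come from platform.platform() and friends
platform_id = 'Windows-7-6.1.7601-SP1 (Intel64 Family 6, GenuineIntel)'
print re.sub("[\(\)\s,]+", "_", platform_id)
# prints: Windows-7-6.1.7601-SP1_Intel64_Family_6_GenuineIntel_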
-import atexit, logging, os, stat, sys
+import atexit, logging, os, shutil, stat, sys
 from theano.compile import optdb
 from theano.gof.cmodule import get_lib_extension
 from theano.configparser import config, AddConfigVar, StrParam
@@ -122,7 +122,11 @@ if cuda_available:
     try:
         open(libcuda_ndarray_so).close()
     except IOError:
-        os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
+        if sys.platform == "win32":
+            # The Python `os` module does not support symlinks on win32.
+            shutil.copyfile(cuda_ndarray_so, libcuda_ndarray_so)
+        else:
+            os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
     try:
         gpu_init()
...
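The symlink-versus-copy change above, restated as a self-contained sketch (the helper name is ours, not Theano's): Python 2's os module only provides symlink on POSIX systems (Windows support only arrived in Python 3.2), so on win32 the compiled library is copied instead of linked:

import os, shutil, sys

def link_or_copy(src, dst):
    # hypothetical helper mirroring the fallback used above
    if sys.platform == "win32":
        # no os.symlink on win32 under Python 2
        shutil.copyfile(src, dst)
    else:
        os.symlink(src, dst)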
@@ -471,7 +471,10 @@ class GpuSum(Op):
             )
         {
         """ %locals()
-        print >> sio, "int new_dims[%(nd_out)s]; " % locals()
+        if nd_out > 0:
+            print >> sio, "int new_dims[%(nd_out)s]; " % locals()
+        else:
+            print >> sio, "int *new_dims=NULL; "
         j = 0
         for i in xrange(nd_in):
...
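The guard above exists because a full reduction leaves nd_out == 0, and the old code would then emit "int new_dims[0];", a zero-sized array that is not legal C (MSVC rejects it; gcc only accepts it as an extension). A minimal sketch of the emitted declaration, assuming only the nd_out count:

def new_dims_decl(nd_out):
    # hypothetical condensation of the branch above
    if nd_out > 0:
        return "int new_dims[%d]; " % nd_out   # e.g. "int new_dims[2]; "
    return "int *new_dims=NULL; "              # 0-d case: no array at all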
+#define _CUDA_NDARRAY_C
 #include <Python.h>
 #include <structmember.h>
@@ -3420,6 +3422,292 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
     return NULL;
 }
int
cnda_structure_size(int nd)
{
// dim0, dim1, ...
// str0, str1, ...
// log2(dim0), log2(dim1), ...
return nd + nd + nd;
}
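/* The accessors below index into host_structure: entries [0, nd) hold the
   dimensions, [nd, 2*nd) the strides, and [2*nd, 3*nd) the cached log2 of
   each dimension (-1 when a dimension is not a power of two; see
   CudaNdarray_set_dim). */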
const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{
return self->host_structure;
}
const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{
return self->host_structure + self->nd;
}
const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{
return self->host_structure + 2*self->nd;
}
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
self->dev_structure_fresh = 0;
}
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
int verbose = 1;
if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
return 0;
}
if (cnda1->nd != cnda2->nd)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
return 0;
}
for (int i=0; i < 2*cnda1->nd; i++)
{
if (cnda1->host_structure[i] != cnda2->host_structure[i])
{
if(verbose)
fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
return 0;
}
}
if (!ignoreBase && cnda1->base != cnda2->base)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4");
return 0;
}
else if (cnda1->data_allocated != cnda2->data_allocated)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5");
return 0;
}
else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6");
// no need to check devdata if data is not allocated
return 0;
}
return 1;
}
int
CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
{
return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
}
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
}
if (d != self->host_structure[idx])
{
self->host_structure[idx] = d;
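// cache log2(d) in the third block of host_structure; -1 marks a
// dimension that is not a power of two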
int log2d = (int)log2((double)d);
self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
cnda_mark_dev_structure_dirty(self);
}
}
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
}
if (s != CudaNdarray_HOST_STRIDES(self)[idx])
{
self->host_structure[idx+self->nd] = s;
cnda_mark_dev_structure_dirty(self);
}
}
int
cnda_copy_structure_to_device(CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
return -1;
}
self->dev_structure_fresh = 1;
return 0;
}
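/* The DEV_* accessors below refresh the device copy lazily: the structure
   is re-uploaded only after CudaNdarray_set_dim/set_stride marked it dirty. */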
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure;
}
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure + self->nd;
}
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure + 2*self->nd;
}
float *
CudaNdarray_DEV_DATA(const CudaNdarray * self)
{
return self->devdata;
}
/**
* Return the number of elements in the ndarray (product of the dimensions)
*/
int
CudaNdarray_SIZE(const CudaNdarray *self)
{
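// nd == -1 denotes an ndarray whose structure is not allocated yet
// (see CudaNdarray_set_nd)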
if (self->nd == -1) return 0;
int size = 1;
for (int i = 0; i < self->nd; ++i)
{
size *= CudaNdarray_HOST_DIMS(self)[i];
}
return size;
}
PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{
return PyInt_FromLong(CudaNdarray_SIZE(self));
}
int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
if (nd != self->nd)
{
if (self->dev_structure)
{
if (device_free(self->dev_structure))
{
return -1;
}
self->dev_structure = NULL;
}
if (self->host_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->nd = -1;
}
if (nd == -1) return 0;
self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
if (NULL == self->host_structure)
{
PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
return -1;
}
//initialize all dimensions and strides to 0
for (int i = 0; i < cnda_structure_size(nd); ++i)
{
self->host_structure[i] = 0;
}
int struct_size = cnda_structure_size(nd);
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
}
self->nd = nd;
self->dev_structure_fresh = 0;
}
return 0;
}
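// Convenience overload: forwards to the variant of
// CudaNdarray_set_device_data that takes the base as a PyObject*.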
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
{
return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
}
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
}
fprintf(fd, "\n\tHOST_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
}
int data=0;
fprintf(fd, "\n\tDEV_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure+i, 1,
&data, 1);
fprintf(fd, "%i\t", data);
}
fprintf(fd, "\n\tDEV_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure + self->nd+i, 1,
&data, 1);
fprintf(fd, "%i \t", data);
}
fprintf(fd, "\n");
}
/*
  Local Variables:
  mode:c++
...
@@ -534,33 +534,44 @@ class NaiveAlgo(object):
         # collapse dimension that are broadcast in all inputs.
         # need to be done before contiguous collapse as it will break it.
         # do the dimensions and the strides
+        if nd > 0:
+            print >> sio, "int local_dims[%(nd)s];" % locals()
+        else:
+            print >> sio, "int *local_dims=NULL;"
+        if nb_inputs > 0 and nd > 0:
+            print >> sio, """
+        int local_str[%(nb_inputs)s][%(nd)s];
+        int local_ostr[%(nb_inputs)s][%(nd)s];
+        """ % locals()
+        else:
+            print >> sio, """
+        int local_str[1][1];
+        int local_ostr[1][1];
+        """
         print >> sio, """
-        int local_dims[%(nd)s];
-        int local_str[%(nb_inputs)s][%(nd)s];
-        int local_ostr[%(nb_inputs)s][%(nd)s];
         int nd_collapse = %(nd)s;
         for(int i=0;i<%(nd)s;i++){//init new dim
           local_dims[i]=dims[i];
         }
-        """%locals()
+        """ % locals()
         for ipos in xrange(len(node.inputs)):
             print >> sio, """
         for(int i=0;i<%(nd)s;i++){//init new strides
           local_str[%(ipos)s][i]=i%(ipos)s_str[i];
         }
-        """%locals()
+        """ % locals()
         for ipos in xrange(len(node.outputs)):
             print >> sio, """
         for(int i=0;i<%(nd)s;i++){//init new strides
           local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
         }
-        """%locals()
+        """ % locals()
         if self.verbose>2:
             print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
             print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
             print >> sio, 'std::cerr << "local_dims";'
             for d in xrange(nd):
-                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
+                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
             print >> sio, 'std::cerr << "\\n";'
         for ipos in xrange(len(node.inputs)):
@@ -611,11 +622,18 @@ class NaiveAlgo(object):
         # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
         # this is a good idea because we make less index calculation in the gpu.

-        print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in xrange(nd)]) +"};"
+        if nd > 0:
+            print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in xrange(nd)]) +"};"
+        else:
+            print >> sio, "int *nd_collapse_ = NULL;"
         for ipos in xrange(len(node.inputs)):
             if not _logical_scalar(node.inputs[ipos]):
-                print >> sio, """
-                    int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in xrange(nd)]) +"};"
+                if nd > 0:
+                    print >> sio, """
+                    int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in xrange(nd)]) +"};"
+                else:
+                    print >> sio, """
+                    int *nd_collapse_%(ipos)s = NULL;"""%locals()
                 print >> sio, """
                 can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
                 for(int i=0;i<nd_collapse;i++){
@@ -839,9 +857,14 @@ nd_collapse_[i]=0;
         //std::cerr << "C_CODE %(opname)s START\\n";
         //standard elemwise size checks
         """ %locals()
-        print >> sio, """
-        int dims[%(nd)s] = {%(initial_dims)s};
-        """ %locals()
+        if nd > 0:
+            print >> sio, """
+        int dims[%(nd)s] = {%(initial_dims)s};
+        """ % locals()
+        else:
+            print >> sio, """
+        int *dims = NULL;
+        """

         #check that all inputs have valid dimensions
         emitted_inames = {}
@@ -851,9 +874,14 @@ nd_collapse_[i]=0;
                 continue
             broadcasts = ', '.join(map(str,map(int,node.inputs[id].broadcastable)))
             nd = node.inputs[id].ndim
-            print >> sio, """
-            int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
-            """ %locals()
+            if nd > 0:
+                print >> sio, """
+            int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
+            """ % locals()
+            else:
+                print >> sio, """
+            int *broadcasts_%(iname)s = NULL;
+            """ % locals()
             emitted_inames[iname] = node.inputs[id]
         #check that all inputs have valid dimensions
         emitted_inames = {}
...
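For context, the collapsing that these nd > 0 guards protect can be sketched in plain Python (a simplified model under our own assumptions, not the generated CUDA code): two adjacent dimensions are merged whenever, for every input, the outer stride equals the inner dimension times the inner stride, which leaves the kernel fewer indices to compute:

def collapse_contiguous(dims, strides_list):
    # dims: list of ints; strides_list: one stride list per input, in elements
    dims = list(dims)
    strides_list = [list(s) for s in strides_list]
    i = len(dims) - 1
    while i > 0:
        if all(s[i - 1] == dims[i] * s[i] for s in strides_list):
            dims[i - 1] *= dims[i]      # merge dimensions i-1 and i
            del dims[i]
            for s in strides_list:
                s[i - 1] = s[i]
                del s[i]
        i -= 1
    return dims, strides_list

# e.g. a C-contiguous (10, 20) input with strides (20, 1) collapses to a
# single dimension: collapse_contiguous([10, 20], [[20, 1]]) -> ([200], [[1]])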
@@ -164,7 +164,12 @@ def nvcc_module_compile_str(
     if config.nvcc.compiler_bindir:
         cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])

-    if sys.platform!='win32':
+    if sys.platform == 'win32':
+        # add flags for Microsoft compiler to create .pdb files
+        preargs2.append('/Zi')
+        cmd.extend(['-Xlinker', '/DEBUG'])
+    if sys.platform != 'win32':
         if local_bitwidth() == 64:
             cmd.append('-m64')
             preargs2.append('-m64')
@@ -180,8 +185,10 @@ def nvcc_module_compile_str(
     if sys.platform != 'darwin':
         # the 64bit CUDA libs are in the same files as are named by the function above
         rpaths.append(os.path.join(config.cuda.root,'lib64'))
-    for rpath in rpaths:
-        cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
+    if sys.platform != 'win32':
+        # the -rpath option is not understood by the Microsoft linker
+        for rpath in rpaths:
+            cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
     cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
     cmd.extend('-I%s'%idir for idir in include_dirs)
     cmd.extend(['-o',lib_filename])
...
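A detail worth spelling out in the rpath hunk (the path is illustrative): nvcc splits the value given to -Xlinker on commas before forwarding it to the host linker, so the join builds one forwardable token per rpath. The Microsoft linker has no -rpath concept at all, since Windows resolves DLLs through PATH at load time, hence the new platform guard:

>>> ','.join(['-rpath', '/usr/local/cuda/lib64'])
'-rpath,/usr/local/cuda/lib64'
# nvcc forwards this to the host linker as: -rpath /usr/local/cuda/lib64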