make Theano gpu compilation go farther on Windows.

f359b692 · Frédéric Bastien · 69f1c9a3 · f359b692 · f359b692 · f359b692
--- a/doc/install.txt
+++ b/doc/install.txt
@@ -292,12 +292,17 @@ Windows V1(bigger install, but simpler instruction + try instruction for gpu)
      [blas]
      ldflags =

-  Space or non ascii caracter are not always supported. If that is your case,
-  Set the environment variable 'THEANO_FLAGS' to the value 'blas.ldflags='
-
  This is enough to run Theano! It will use NumPy for dot products
  which, however, is pretty fast (see below).
+  
+  To test that Theano correctly reads the .theanorc or .theanorc.txt file,
+  run the following in Python:
+  .. code-block:: python
+      import theano
+      print theano.config.blas.ldflags

+  That should print the same content as what is in your config file.
+  
 - (Optional) If you want a faster and/or multithreaded BLAS library, you can
  compile GotoBLAS2. I did not try to compile ATLAS because I read that
  it is slower than Goto and very difficult to compile (especially for
@@ -342,22 +347,49 @@ Windows V1(bigger install, but simpler instruction + try instruction for gpu)

  Those are indication for 32 bits version of python, the one that come with pythonxy is 32 bits.

-  Install cuda driver(32 bits on 32 bits Windows, idem for 64 bits).
-  Install cuda toolkit 32 bits(even if you computer is 64 bits, must match the python installation version)
-  Install cuda sdk 32 bits
-  Test some pre-compiled example of the sdk
-  Download Visual Studio 2008 Express(free, VS2010 not supported by nvcc 3.1, VS2005, not available for download, but supported by nvcc, the non free version should work too)
-  Follow the instruction in the GettingStartedWindows.pdf file from cuda web site to compile cuda code with VS2008. If that don't work, you won't be able to compile gpu code with theano.
+  Spaces and non-ASCII characters are not always supported in paths. Python
+  supports them, so your configuration file path can contain them, but nvcc
+  (at least version 3.1) does not handle them well. If your USERPROFILE
+  directory contains such characters, you must add to your configuration file:
+    .. code-block:: bash
+      [global]
+      base_compiledir=PATH_TO_A_DIRECTORY_WITHOUT_THOSE_CHARACTERS
+
+  You also need to add these lines to the configuration file:
+    .. code-block:: bash
+      [cuda]
+      nvccflags=-LC:\Python26\libs
+
+  Then
+  
+  1) Install cuda driver(32 bits on 32 bits Windows, idem for 64 bits).
+  
+  2) Install the 32-bit CUDA toolkit (even if your computer is 64 bits;
+     it must match the Python installation version).
+  
+  3) Install cuda sdk 32 bits
+  
+  4) Test some pre-compiled example of the sdk
+
+  5) Download Visual Studio 2008 Express (free). VS2010 is not supported by nvcc 3.1;
+     VS2005 is supported by nvcc but no longer available for download. The non-free versions should work too.
+
+  6) Follow the instructions in the GettingStartedWindows.pdf file from the CUDA
+     web site to compile CUDA code with VS2008. If that doesn't work, you won't
+     be able to compile GPU code with Theano.

-  Put into you PATH environment variable the directory where cl.exe is. In my case it is: C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin
+  7) Add the directory that contains cl.exe to your PATH environment variable.
+     In my case it is: C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin

-  Make sure the theano folder is in your PYTHONPATH environment variable.
+  8) Make sure the theano folder is in your PYTHONPATH environment variable.

-  Then in theano do: import theano.sandbox.cuda
+  9) Then, in Python, run: import theano.sandbox.cuda

-  That will print some error if their is an error to compile the first cuda file.
+     That will print an error message if the first CUDA file fails to compile.

-  Then run the theano cuda test file. In Windows command line (cmd.exe), run the program nosetests inside the theano repository. nosetests is installed by pythonxy.
+  10) Then run the theano cuda test file. In Windows command line (cmd.exe), 
+      run the program nosetests inside the theano repository. 
+      nosetests is installed by pythonxy.

 Windows V2(smaller install, but longer instruction)
 ---------------------------------------------------

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -49,7 +49,7 @@ def set_cuda_disabled():
                'working properly')

 #cuda_ndarray compile and import
-cuda_path = os.path.split(__file__)[0]
+cuda_path = os.path.abspath(os.path.split(__file__)[0])
 date = os.stat(os.path.join(cuda_path,'cuda_ndarray.cu'))[stat.ST_MTIME]
 date = max(date,os.stat(os.path.join(cuda_path,'cuda_ndarray.cuh'))[stat.ST_MTIME])
 date = max(date,os.stat(os.path.join(cuda_path,'conv_full_kernel.cu'))[stat.ST_MTIME])

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -13,6 +13,9 @@ AddConfigVar('nvcc.compiler_bindir',
        "if defined, nvcc compiler driver will seek g++ and gcc in this directory",
        StrParam(""))

+AddConfigVar('cuda.nvccflags',
+        "Extra compiler flags for nvcc",
+        StrParam(""))

 def error(*args):
    #sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n')
@@ -62,7 +65,18 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
    :param preargs: a list of extra compiler arguments

    :returns: dynamically-imported python module of the compiled code.
+    
+    :note 1: On Windows 7 with nvcc 3.1 we need to compile in the real directory
+             Otherwise nvcc never finish.
    """
+    
+    if sys.platform=="win32":
+        #remove some compilation args that cl.exe don't understand
+        #cl.exe is the compiler used by nvcc on Windows
+        for a in ["-Wno-write-strings","-Wno-unused-label",
+                  "-Wno-unused-variable", "-fno-math-errno"]:
+            if a in preargs:
+                preargs.remove(a)
    if preargs is None:
        preargs= []
    else: preargs = list(preargs)
@@ -70,8 +84,17 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
        preargs.append('-fPIC')
    no_opt = False
    cuda_root = config.cuda.root
-    include_dirs = std_include_dirs() + include_dirs + [os.path.split(__file__)[0]]
-    libs = std_libs() + ['cudart'] + libs
+
+    #The include dirs given by the user should take precedence over
+    #the standard ones.
+    include_dirs = include_dirs + std_include_dirs()
+    if os.path.abspath(os.path.split(__file__)[0]) not in include_dirs:
+        include_dirs.append(os.path.abspath(os.path.split(__file__)[0]))
+    
+    libs = std_libs() + libs
+    if 'cudart' not in libs:
+        libs.append('cudart')
+
    lib_dirs = std_lib_dirs() + lib_dirs
    if cuda_root:
        lib_dirs.append(os.path.join(cuda_root, 'lib'))
@@ -133,11 +156,13 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
        if sys.platform != 'darwin':
            # the 64bit CUDA libs are in the same files as are named by the function above
            cmd.extend(['-Xlinker',','.join(['-rpath',os.path.join(config.cuda.root,'lib64')])])
+    nvccflags = [flag for flag in config.cuda.nvccflags.split(' ') if flag]
+    cmd.extend(nvccflags)
    cmd.extend('-I%s'%idir for idir in include_dirs)
-    cmd.extend(['-o',lib_filename]) 
-    cmd.append(cppfilename)
+    cmd.extend(['-o',lib_filename])
+    cmd.append(os.path.split(cppfilename)[-1])
    if module_name != 'cuda_ndarray':
-        cmd.append(os.path.join(os.path.split(cppfilename)[0],'..','cuda_ndarray','cuda_ndarray.so'))
+        cmd.append(os.path.join(os.path.split(cppfilename)[0],'..','cuda_ndarray','cuda_ndarray.'+get_lib_extension()))
    cmd.extend(['-L%s'%ldir for ldir in lib_dirs])
    cmd.extend(['-l%s'%l for l in libs])
    if sys.platform == 'darwin':
@@ -156,12 +181,16 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
                done = True
    
    #cmd.append("--ptxas-options=-v")  #uncomment this to see register and shared-mem requirements
-    #print >> sys.stderr, 'COMPILING W CMD', cmd
    debug('Running cmd', ' '.join(cmd))
+    orig_dir = os.getcwd()
+    try:
+        os.chdir(location)

-    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    nvcc_stdout, nvcc_stderr = p.communicate()[:2]
-
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        nvcc_stdout, nvcc_stderr = p.communicate()[:2]
+    finally:
+        os.chdir(orig_dir)
+        
    if nvcc_stdout:
        # this doesn't happen to my knowledge
        print >> sys.stderr, "DEBUG: nvcc STDOUT", nvcc_stdout