提交 1da867d8 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge remote-tracking branch 'delallea/surban-master'

......@@ -33,3 +33,4 @@ theano/version.py
theano/version.py.out
distribute-*.egg
distribute-*.tar.gz
Theano.suo
差异被折叠。

Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "Theano", "Theano.pyproj", "{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
......@@ -4,6 +4,7 @@ import errno
import os
import platform
import re
import sys
import theano
from theano.configparser import config, AddConfigVar, ConfigParam, StrParam
......@@ -14,7 +15,7 @@ def default_compiledirname():
platform.platform(),
platform.processor(),
platform.python_version()])
platform_id = re.sub("[\(\)\s]+", "_", platform_id)
platform_id = re.sub("[\(\)\s,]+", "_", platform_id)
return 'compiledir_' + platform_id
......@@ -50,9 +51,19 @@ def filter_compiledir(path):
return path
# TODO Using the local user profile on Windows is currently disabled as it
# is not documented yet, and may break some existing code. It will be enabled
# in a future code update.
if False and sys.platform == 'win32':
# On Windows we should not write temporary files to a directory
# that is part of the roaming part of the user profile. Instead
# we use the local part of the user profile.
basecompiledir = os.path.join(os.environ['LOCALAPPDATA'], 'theano')
else:
basecompiledir = os.path.join(config.home, '.theano')
AddConfigVar('base_compiledir',
"arch-independent cache directory for compiled modules",
StrParam(os.path.join(config.home, '.theano'), allow_override=False))
StrParam(basecompiledir, allow_override=False))
AddConfigVar('compiledir',
"arch-dependent cache directory for compiled modules",
......
import atexit, logging, os, stat, sys
import atexit, logging, os, shutil, stat, sys
from theano.compile import optdb
from theano.gof.cmodule import get_lib_extension
from theano.configparser import config, AddConfigVar, StrParam
......@@ -122,7 +122,11 @@ if cuda_available:
try:
open(libcuda_ndarray_so).close()
except IOError:
os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
if sys.platform == "win32":
# The Python `os` module does not support symlinks on win32.
shutil.copyfile(cuda_ndarray_so, libcuda_ndarray_so)
else:
os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
try:
gpu_init()
......
......@@ -471,7 +471,10 @@ class GpuSum(Op):
)
{
""" %locals()
print >> sio, "int new_dims[%(nd_out)s]; " % locals()
if nd_out > 0:
print >> sio, "int new_dims[%(nd_out)s]; " % locals()
else:
print >> sio, "int *new_dims=NULL; "
j = 0
for i in xrange(nd_in):
......
#define _CUDA_NDARRAY_C
#include <Python.h>
#include <structmember.h>
......@@ -3420,6 +3422,292 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
return NULL;
}
/**
 * Number of ints needed to hold the structure of an nd-dimensional array.
 *
 * The structure buffer stores three groups of nd ints back to back:
 * the dimensions, the strides, and log2 of each dimension.
 */
int
cnda_structure_size(int nd)
{
    // dim0, dim1, ... | str0, str1, ... | log2(dim0), log2(dim1), ...
    return 3 * nd;
}
/**
 * Host-side dimensions: the first group of the host structure buffer.
 */
const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{
    const int * dims = self->host_structure;
    return dims;
}
/**
 * Host-side strides: the second group of the host structure buffer,
 * located self->nd ints after its start.
 */
const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{
    const int * structure = self->host_structure;
    return structure + self->nd;
}
/**
 * Host-side log2 of the dimensions: the third group of the host structure
 * buffer, located 2 * self->nd ints after its start.
 */
const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{
    const int * structure = self->host_structure;
    return structure + 2 * self->nd;
}
// Flag the device-side copy of the structure as out of date.
// The CudaNdarray_DEV_* accessors check this flag and re-upload the host
// structure (via cnda_copy_structure_to_device) when it is 0.
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
self->dev_structure_fresh = 0;
}
/**
 * Compare the bookkeeping of two CudaNdarrays (not the element values).
 *
 * Checks, in order: dev_structure_fresh (unless ignoreSync), nd, the host
 * dimensions and strides (the first 2*nd entries of host_structure; the
 * log2 entries are not compared), base (unless ignoreBase), data_allocated
 * and, when data is allocated, devdata.
 *
 * Returns 1 when everything compared matches, 0 otherwise.  On the first
 * mismatch a one-line diagnostic is written to stdout.
 */
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
    int verbose = 1;

    if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
    {
        if (verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
        return 0;
    }

    if (cnda1->nd != cnda2->nd)
    {
        if (verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
        return 0;
    }

    // Dimensions and strides only: entries [0, 2*nd) of host_structure.
    for (int i = 0; i < 2 * cnda1->nd; i++)
    {
        if (cnda1->host_structure[i] != cnda2->host_structure[i])
        {
            if (verbose)
                fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n",
                        i, cnda1->host_structure[i], cnda2->host_structure[i]);
            return 0;
        }
    }

    // Every diagnostic below ends with a newline, like the ones above, so
    // consecutive failure messages do not run together on one line.
    if (!ignoreBase && cnda1->base != cnda2->base)
    {
        if (verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4\n");
        return 0;
    }

    if (cnda1->data_allocated != cnda2->data_allocated)
    {
        if (verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5\n");
        return 0;
    }

    // No need to check devdata if data is not allocated.
    if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
    {
        if (verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6\n");
        return 0;
    }

    return 1;
}
/**
 * Strict comparison of two CudaNdarrays: same as
 * CudaNdarray_EqualAndIgnore with nothing ignored.
 */
int
CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
{
    const int ignore_sync = 0;
    const int ignore_base = 0;
    return CudaNdarray_EqualAndIgnore(cnda1, cnda2, ignore_sync, ignore_base);
}
/**
 * Set dimension `idx` of `self` to `d`.
 *
 * Also refreshes the matching log2 entry (log2(d) when d is a positive
 * power of two, -1 otherwise) and marks the device structure dirty.
 * Nothing is touched when the stored dimension already equals `d`.
 *
 * Suspicious arguments (idx outside [0, nd) or d < 0) produce a warning on
 * stderr.  An out-of-range idx is then ignored, because writing it would
 * corrupt memory outside host_structure.
 */
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
    const bool idx_out_of_range = (idx >= self->nd) || (idx < 0);
    if (idx_out_of_range || (d < 0))
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
    }
    if (idx_out_of_range)
    {
        // Never index host_structure with an invalid position.
        return;
    }

    if (d != self->host_structure[idx])
    {
        self->host_structure[idx] = d;

        // Integer power-of-two test: avoids the undefined conversion of
        // log2(0) = -inf or log2(negative) = NaN to int.
        int log2d = -1;
        if (d > 0 && (d & (d - 1)) == 0)
        {
            log2d = 0;
            while ((1 << log2d) != d)
            {
                ++log2d;
            }
        }
        self->host_structure[idx + 2*self->nd] = log2d;
        cnda_mark_dev_structure_dirty(self);
    }
}
/**
 * Set stride `idx` of `self` to `s`.
 *
 * Marks the device structure dirty when the value actually changes.
 * An idx outside [0, nd) produces a warning on stderr and is then ignored,
 * because reading or writing it would access memory outside the stride
 * section of host_structure.
 */
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
    if ((idx >= self->nd) || (idx < 0))
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
        return;
    }

    if (s != CudaNdarray_HOST_STRIDES(self)[idx])
    {
        // Strides live right after the nd dimensions.
        self->host_structure[idx+self->nd] = s;
        cnda_mark_dev_structure_dirty(self);
    }
}
/**
 * Upload the whole host structure (dims, strides, log2 dims) to
 * self->dev_structure.
 *
 * Returns 0 on success and marks the device structure fresh.  Returns -1
 * with a Python RuntimeError set when cuBLAS reports an error.
 */
int
cnda_copy_structure_to_device(CudaNdarray * self)
{
    const int n_ints = cnda_structure_size(self->nd);

    cublasSetVector(n_ints, sizeof(int), self->host_structure, 1, self->dev_structure, 1);
    CNDA_THREAD_SYNC;

    if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
        return -1;
    }

    self->dev_structure_fresh = 1;
    return 0;
}
/**
 * Device pointer to the dimensions.
 *
 * Uploads the host structure first when the device copy is stale.
 * Returns NULL if that upload fails.
 */
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
{
    const bool stale = !self->dev_structure_fresh;
    if (stale && cnda_copy_structure_to_device(self) != 0)
    {
        return NULL;
    }
    return self->dev_structure;
}
/**
 * Device pointer to the strides (self->nd ints into the device structure).
 *
 * Uploads the host structure first when the device copy is stale.
 * Returns NULL if that upload fails.
 */
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{
    const bool stale = !self->dev_structure_fresh;
    if (stale && cnda_copy_structure_to_device(self) != 0)
    {
        return NULL;
    }
    return self->dev_structure + self->nd;
}
/**
 * Device pointer to the log2 of the dimensions (2 * self->nd ints into the
 * device structure).
 *
 * Uploads the host structure first when the device copy is stale.
 * Returns NULL if that upload fails.
 */
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{
    const bool stale = !self->dev_structure_fresh;
    if (stale && cnda_copy_structure_to_device(self) != 0)
    {
        return NULL;
    }
    return self->dev_structure + 2 * self->nd;
}
/**
 * Return the array's devdata pointer.
 */
float *
CudaNdarray_DEV_DATA(const CudaNdarray * self)
{
    float * data = self->devdata;
    return data;
}
/**
 * Return the number of elements in the ndarray, i.e. the product of its
 * host dimensions.  An array whose nd is -1 has size 0; an array with
 * nd == 0 has size 1 (empty product).
 */
int
CudaNdarray_SIZE(const CudaNdarray *self)
{
    if (self->nd == -1)
    {
        return 0;
    }

    const int * dims = CudaNdarray_HOST_DIMS(self);
    int n_elements = 1;
    for (int axis = 0; axis < self->nd; ++axis)
    {
        n_elements *= dims[axis];
    }
    return n_elements;
}
/**
 * Return CudaNdarray_SIZE(self) as a Python int object.
 * The `closure` argument is not used.
 */
PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{
    const long n_elements = CudaNdarray_SIZE(self);
    return PyInt_FromLong(n_elements);
}
/**
 * Change the number of dimensions of `self` to `nd`.
 *
 * When nd differs from the current value, the existing host and device
 * structure buffers are released and, unless nd == -1, new ones are
 * allocated for nd dimensions.  The new host structure is all zeros and
 * the device structure is marked as not fresh.  nd == -1 leaves the array
 * without any structure.
 *
 * Returns 0 on success and -1 on failure.  On an allocation failure the
 * array is left without structure (nd == -1, both buffers NULL).  A value
 * of nd below -1 is rejected before anything is modified.
 */
int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
    if (nd == self->nd)
    {
        return 0;
    }
    if (nd < -1)
    {
        // Reject nonsense before tearing down the current structure.
        PyErr_SetString(PyExc_ValueError, "CudaNdarray_set_nd: nd must be >= -1");
        return -1;
    }

    if (self->dev_structure)
    {
        if (device_free(self->dev_structure))
        {
            return -1;
        }
        self->dev_structure = NULL;
    }
    if (self->host_structure)
    {
        free(self->host_structure);
        self->host_structure = NULL;
        self->nd = -1;
    }
    if (nd == -1)
    {
        return 0;
    }

    const int struct_size = cnda_structure_size(nd);

    // calloc gives zero-initialized dimensions and strides.  Always request
    // at least one int: for nd == 0 a zero-byte allocation is allowed to
    // return NULL, which would be misreported as an out-of-memory error.
    self->host_structure = (int*)calloc(struct_size > 0 ? struct_size : 1, sizeof(int));
    if (NULL == self->host_structure)
    {
        self->nd = -1;
        PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
        return -1;
    }

    // A 0-d array needs no device-side structure.
    if (struct_size)
    {
        self->dev_structure = (int*)device_malloc(struct_size * sizeof(int));
        if (NULL == self->dev_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
            self->nd = -1;
            return -1;
        }
    }

    self->nd = nd;
    self->dev_structure_fresh = 0;
    return 0;
}
/**
 * Convenience overload taking the base as a CudaNdarray*.
 * It forwards to the overload of CudaNdarray_set_device_data that takes
 * the base as a PyObject*; the explicit cast is what selects that overload
 * instead of recursing into this one.
 */
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{
    PyObject * base_object = (PyObject *) base;
    return CudaNdarray_set_device_data(self, data, base_object);
}
/**
 * Return the result of CudaNdarray_is_c_contiguous(self) as a Python bool.
 */
PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
{
    const long contiguous = CudaNdarray_is_c_contiguous(self);
    return PyBool_FromLong(contiguous);
}
/**
 * Write a human-readable dump of `self` to `fd`: a header line with the
 * pointers and flags, the host dimensions and strides, and the dimensions
 * and strides as currently stored on the device (fetched one int at a time
 * with cublasGetVector).
 */
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
    const int ndim = self->nd;

    fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
            self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);

    fprintf(fd, "\tHOST_DIMS: ");
    for (int k = 0; k < ndim; ++k)
    {
        fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[k]);
    }

    fprintf(fd, "\n\tHOST_STRIDES: ");
    for (int k = 0; k < ndim; ++k)
    {
        fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[k]);
    }

    // Scratch slot receiving one device int per transfer.
    int fetched = 0;

    fprintf(fd, "\n\tDEV_DIMS: ");
    for (int k = 0; k < ndim; ++k)
    {
        cublasGetVector(1, sizeof(int), self->dev_structure + k, 1, &fetched, 1);
        fprintf(fd, "%i\t", fetched);
    }

    fprintf(fd, "\n\tDEV_STRIDES: ");
    for (int k = 0; k < ndim; ++k)
    {
        // Strides follow the ndim dimensions in the device structure.
        cublasGetVector(1, sizeof(int), self->dev_structure + ndim + k, 1, &fetched, 1);
        fprintf(fd, "%i \t", fetched);
    }

    fprintf(fd, "\n");
}
/*
Local Variables:
mode:c++
......
......@@ -534,33 +534,44 @@ class NaiveAlgo(object):
# Collapse dimensions that are broadcast in all inputs.
# This needs to be done before the contiguous collapse, as it will break it.
# Do the dimensions and the strides.
if nd > 0:
print >> sio, "int local_dims[%(nd)s];" % locals()
else:
print >> sio, "int *local_dims=NULL;"
if nb_inputs > 0 and nd > 0:
print >> sio, """
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
""" % locals()
else:
print >> sio, """
int local_str[1][1];
int local_ostr[1][1];
"""
print >> sio, """
int local_dims[%(nd)s];
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
int nd_collapse = %(nd)s;
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
"""%locals()
""" % locals()
for ipos in xrange(len(node.inputs)):
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_str[%(ipos)s][i]=i%(ipos)s_str[i];
}
"""%locals()
""" % locals()
for ipos in xrange(len(node.outputs)):
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
}
"""%locals()
""" % locals()
if self.verbose>2:
print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
print >> sio, 'std::cerr << "local_dims";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
......@@ -611,11 +622,18 @@ class NaiveAlgo(object):
# Collapse contiguous dimensions (ignoring scalars; generic version that can collapse any dimensions: right, left, middle).
# This is a good idea because it means fewer index calculations on the GPU.
print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in xrange(nd)]) +"};"
if nd > 0:
print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in xrange(nd)]) +"};"
else:
print >> sio, "int *nd_collapse_ = NULL;"
for ipos in xrange(len(node.inputs)):
if not _logical_scalar(node.inputs[ipos]):
print >> sio, """
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in xrange(nd)]) +"};"
if nd > 0:
print >> sio, """
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in xrange(nd)]) +"};"
else:
print >> sio, """
int *nd_collapse_%(ipos)s = NULL;"""%locals()
print >> sio, """
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
......@@ -839,9 +857,14 @@ nd_collapse_[i]=0;
//std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks
""" %locals()
print >> sio, """
int dims[%(nd)s] = {%(initial_dims)s};
""" %locals()
if nd > 0:
print >> sio, """
int dims[%(nd)s] = {%(initial_dims)s};
""" % locals()
else:
print >> sio, """
int *dims = NULL;
"""
#check that all inputs have valid dimensions
emitted_inames = {}
......@@ -851,9 +874,14 @@ nd_collapse_[i]=0;
continue
broadcasts = ', '.join(map(str,map(int,node.inputs[id].broadcastable)))
nd = node.inputs[id].ndim
print >> sio, """
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
""" %locals()
if nd > 0:
print >> sio, """
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
""" % locals()
else:
print >> sio, """
int *broadcasts_%(iname)s = NULL;
""" % locals()
emitted_inames[iname] = node.inputs[id]
#check that all inputs have valid dimensions
emitted_inames = {}
......
......@@ -164,7 +164,12 @@ def nvcc_module_compile_str(
if config.nvcc.compiler_bindir:
cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])
if sys.platform!='win32':
if sys.platform == 'win32':
# add flags for Microsoft compiler to create .pdb files
preargs2.append('/Zi')
cmd.extend(['-Xlinker', '/DEBUG'])
if sys.platform != 'win32':
if local_bitwidth() == 64:
cmd.append('-m64')
preargs2.append('-m64')
......@@ -180,8 +185,10 @@ def nvcc_module_compile_str(
if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above
rpaths.append(os.path.join(config.cuda.root,'lib64'))
for rpath in rpaths:
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
if sys.platform != 'win32':
# the -rpath option is not understood by the Microsoft linker
for rpath in rpaths:
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
cmd.extend('-I%s'%idir for idir in include_dirs)
cmd.extend(['-o',lib_filename])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论