Commit 1da867d8 authored by Olivier Delalleau

Merge remote-tracking branch 'delallea/surban-master'

...@@ -33,3 +33,4 @@ theano/version.py ...@@ -33,3 +33,4 @@ theano/version.py
theano/version.py.out theano/version.py.out
distribute-*.egg distribute-*.egg
distribute-*.tar.gz distribute-*.tar.gz
Theano.suo
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{b67d762d-0020-4e02-9ddf-7db4f89b1dd3}</ProjectGuid>
<ProjectHome>.</ProjectHome>
<StartupFile>
</StartupFile>
<SearchPath>
</SearchPath>
<WorkingDirectory>.</WorkingDirectory>
<OutputPath>.</OutputPath>
<Name>Theano</Name>
<RootNamespace>Theano</RootNamespace>
<IsWindowsApplication>False</IsWindowsApplication>
<InterpreterId>2af0f10d-7135-4994-9156-5d01c9c11b7e</InterpreterId>
<InterpreterVersion>2.7</InterpreterVersion>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
<DebugSymbols>true</DebugSymbols>
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
<DebugSymbols>true</DebugSymbols>
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<ItemGroup>
<Compile Include="theano\compile\builders.py" />
<Compile Include="theano\compile\debugmode.py" />
<Compile Include="theano\compile\function.py" />
<Compile Include="theano\compile\function_module.py" />
<Compile Include="theano\compile\io.py" />
<Compile Include="theano\compile\mode.py" />
<Compile Include="theano\compile\module.py" />
<Compile Include="theano\compile\pfunc.py" />
<Compile Include="theano\compile\profilemode.py" />
<Compile Include="theano\compile\profiling.py" />
<Compile Include="theano\compile\sandbox\__init__.py" />
<Compile Include="theano\compile\sharedvalue.py" />
<Compile Include="theano\compile\tests\test_builders.py" />
<Compile Include="theano\compile\tests\test_debugmode.py" />
<Compile Include="theano\compile\tests\test_function_module.py" />
<Compile Include="theano\compile\tests\test_inplace_opt_for_value.py" />
<Compile Include="theano\compile\tests\test_misc.py" />
<Compile Include="theano\compile\tests\test_modes.py" />
<Compile Include="theano\compile\tests\test_module.py" />
<Compile Include="theano\compile\tests\test_pfunc.py" />
<Compile Include="theano\compile\tests\test_shared.py" />
<Compile Include="theano\compile\tests\__init__.py" />
<Compile Include="theano\compile\__init__.py" />
<Compile Include="theano\configdefaults.py" />
<Compile Include="theano\configparser.py" />
<Compile Include="theano\gof\callcache.py" />
<Compile Include="theano\gof\cc.py" />
<Compile Include="theano\gof\cmodule.py" />
<Compile Include="theano\gof\compiledir.py" />
<Compile Include="theano\gof\compilelock.py" />
<Compile Include="theano\gof\cutils.py" />
<Compile Include="theano\gof\destroyhandler.py" />
<Compile Include="theano\gof\env.py" />
<Compile Include="theano\gof\graph.py" />
<Compile Include="theano\gof\lazylinker_c.py" />
<Compile Include="theano\gof\link.py" />
<Compile Include="theano\gof\op.py" />
<Compile Include="theano\gof\opt.py" />
<Compile Include="theano\gof\optdb.py" />
<Compile Include="theano\gof\python25.py" />
<Compile Include="theano\gof\sandbox\equilibrium.py" />
<Compile Include="theano\gof\tests\test_cc.py" />
<Compile Include="theano\gof\tests\test_compute_test_value.py" />
<Compile Include="theano\gof\tests\test_destroyhandler.py" />
<Compile Include="theano\gof\tests\test_graph.py" />
<Compile Include="theano\gof\tests\test_lazy.py" />
<Compile Include="theano\gof\tests\test_link.py" />
<Compile Include="theano\gof\tests\test_op.py" />
<Compile Include="theano\gof\tests\test_opt.py" />
<Compile Include="theano\gof\tests\test_optdb.py" />
<Compile Include="theano\gof\tests\test_toolbox.py" />
<Compile Include="theano\gof\tests\test_types.py" />
<Compile Include="theano\gof\tests\test_vm.py" />
<Compile Include="theano\gof\tests\__init__.py" />
<Compile Include="theano\gof\toolbox.py" />
<Compile Include="theano\gof\type.py" />
<Compile Include="theano\gof\unify.py" />
<Compile Include="theano\gof\utils.py" />
<Compile Include="theano\gof\vm.py" />
<Compile Include="theano\gof\__init__.py" />
<Compile Include="theano\gradient.py" />
<Compile Include="theano\ifelse.py" />
<Compile Include="theano\misc\buildbot_filter.py" />
<Compile Include="theano\misc\check_blas.py" />
<Compile Include="theano\misc\check_duplicate_key.py" />
<Compile Include="theano\misc\cudamat_utils.py" />
<Compile Include="theano\misc\gnumpy_utils.py" />
<Compile Include="theano\misc\hooks\argparse.py" />
<Compile Include="theano\misc\hooks\check_whitespace.py" />
<Compile Include="theano\misc\hooks\reindent.py" />
<Compile Include="theano\misc\latence_gpu_transfert.py" />
<Compile Include="theano\misc\may_share_memory.py" />
<Compile Include="theano\misc\pycuda_example.py" />
<Compile Include="theano\misc\pycuda_init.py" />
<Compile Include="theano\misc\pycuda_utils.py" />
<Compile Include="theano\misc\safe_asarray.py" />
<Compile Include="theano\misc\strutil.py" />
<Compile Include="theano\misc\tests\test_cudamat_utils.py" />
<Compile Include="theano\misc\tests\test_gnumpy_utils.py" />
<Compile Include="theano\misc\tests\test_may_share_memory.py" />
<Compile Include="theano\misc\tests\test_pycuda_example.py" />
<Compile Include="theano\misc\tests\test_pycuda_theano_simple.py" />
<Compile Include="theano\misc\tests\test_pycuda_utils.py" />
<Compile Include="theano\misc\__init__.py" />
<Compile Include="theano\printing.py" />
<Compile Include="theano\raise_op.py" />
<Compile Include="theano\sandbox\conv.py" />
<Compile Include="theano\sandbox\cuda\basic_ops.py" />
<Compile Include="theano\sandbox\cuda\blas.py" />
<Compile Include="theano\sandbox\cuda\elemwise.py" />
<Compile Include="theano\sandbox\cuda\GpuConv3D.py" />
<Compile Include="theano\sandbox\cuda\GpuConvGrad3D.py" />
<Compile Include="theano\sandbox\cuda\GpuConvTransp3D.py" />
<Compile Include="theano\sandbox\cuda\kernel_codegen.py" />
<Compile Include="theano\sandbox\cuda\nnet.py" />
<Compile Include="theano\sandbox\cuda\nvcc_compiler.py" />
<Compile Include="theano\sandbox\cuda\opt.py" />
<Compile Include="theano\sandbox\cuda\rng_curand.py" />
<Compile Include="theano\sandbox\cuda\type.py" />
<Compile Include="theano\sandbox\cuda\var.py" />
<Compile Include="theano\sandbox\cuda\__init__.py" />
<Compile Include="theano\sandbox\debug.py" />
<Compile Include="theano\sandbox\downsample.py" />
<Compile Include="theano\sandbox\fourier.py" />
<Compile Include="theano\sandbox\linalg\ops.py" />
<Compile Include="theano\sandbox\linalg\__init__.py" />
<Compile Include="theano\sandbox\minimal.py" />
<Compile Include="theano\sandbox\multinomial.py" />
<Compile Include="theano\sandbox\neighbourhoods.py" />
<Compile Include="theano\sandbox\neighbours.py" />
<Compile Include="theano\sandbox\rng_mrg.py" />
<Compile Include="theano\sandbox\softsign.py" />
<Compile Include="theano\sandbox\solve.py" />
<Compile Include="theano\sandbox\symbolic_module.py" />
<Compile Include="theano\sandbox\test_multinomial.py" />
<Compile Include="theano\sandbox\test_neighbourhoods.py" />
<Compile Include="theano\sandbox\test_neighbours.py" />
<Compile Include="theano\sandbox\test_rng_mrg.py" />
<Compile Include="theano\sandbox\test_theano_object.py" />
<Compile Include="theano\sandbox\theano_object.py" />
<Compile Include="theano\sandbox\__init__.py" />
<Compile Include="theano\scalar\basic.py" />
<Compile Include="theano\scalar\basic_scipy.py" />
<Compile Include="theano\scalar\sharedvar.py" />
<Compile Include="theano\scalar\__init__.py" />
<Compile Include="theano\scan_module\scan.py" />
<Compile Include="theano\scan_module\scan_op.py" />
<Compile Include="theano\scan_module\scan_opt.py" />
<Compile Include="theano\scan_module\scan_perform_ext.py" />
<Compile Include="theano\scan_module\scan_utils.py" />
<Compile Include="theano\scan_module\scan_views.py" />
<Compile Include="theano\scan_module\__init__.py" />
<Compile Include="theano\sparse\basic.py" />
<Compile Include="theano\sparse\sandbox\sp.py" />
<Compile Include="theano\sparse\sandbox\test_sp.py" />
<Compile Include="theano\sparse\sandbox\truedot.py" />
<Compile Include="theano\sparse\sandbox\__init__.py" />
<Compile Include="theano\sparse\sharedvar.py" />
<Compile Include="theano\sparse\__init__.py" />
<Compile Include="theano\tensor\basic.py" />
<Compile Include="theano\tensor\blas.py" />
<Compile Include="theano\tensor\blas_headers.py" />
<Compile Include="theano\tensor\blas_scipy.py" />
<Compile Include="theano\tensor\deprecated\rmodule.py" />
<Compile Include="theano\tensor\deprecated\test_rmodule.py" />
<Compile Include="theano\tensor\deprecated\__init__.py" />
<Compile Include="theano\tensor\elemwise.py" />
<Compile Include="theano\tensor\elemwise_cgen.py" />
<Compile Include="theano\tensor\inplace.py" />
<Compile Include="theano\tensor\nnet\conv.py" />
<Compile Include="theano\tensor\nnet\Conv3D.py" />
<Compile Include="theano\tensor\nnet\ConvGrad3D.py" />
<Compile Include="theano\tensor\nnet\ConvTransp3D.py" />
<Compile Include="theano\tensor\nnet\nnet.py" />
<Compile Include="theano\tensor\nnet\sigm.py" />
<Compile Include="theano\tensor\nnet\__init__.py" />
<Compile Include="theano\tensor\opt.py" />
<Compile Include="theano\tensor\opt_uncanonicalize.py" />
<Compile Include="theano\tensor\randomstreams.py" />
<Compile Include="theano\tensor\raw_random.py" />
<Compile Include="theano\tensor\sharedvar.py" />
<Compile Include="theano\tensor\shared_randomstreams.py" />
<Compile Include="theano\tensor\signal\conv.py" />
<Compile Include="theano\tensor\signal\downsample.py" />
<Compile Include="theano\tensor\signal\__init__.py" />
<Compile Include="theano\tensor\tensor_grad.py" />
<Compile Include="theano\tensor\xlogx.py" />
<Compile Include="theano\tensor\__init__.py" />
<Compile Include="theano\updates.py" />
<Compile Include="theano\__init__.py" />
</ItemGroup>
<ItemGroup>
<Folder Include="theano\" />
<Folder Include="theano\compile\" />
<Folder Include="theano\compile\sandbox\" />
<Folder Include="theano\compile\tests\" />
<Folder Include="theano\gof\" />
<Folder Include="theano\gof\sandbox\" />
<Folder Include="theano\gof\tests\" />
<Folder Include="theano\misc\" />
<Folder Include="theano\misc\hooks\" />
<Folder Include="theano\misc\tests\" />
<Folder Include="theano\sandbox\" />
<Folder Include="theano\sandbox\cuda\" />
<Folder Include="theano\sandbox\linalg\" />
<Folder Include="theano\scalar\" />
<Folder Include="theano\scan_module\" />
<Folder Include="theano\sparse\" />
<Folder Include="theano\sparse\sandbox\" />
<Folder Include="theano\tensor\" />
<Folder Include="theano\tensor\deprecated\" />
<Folder Include="theano\tensor\nnet\" />
<Folder Include="theano\tensor\signal\" />
</ItemGroup>
<ItemGroup>
<Content Include="theano\sandbox\cuda\conv.cu" />
<Content Include="theano\sandbox\cuda\conv_full_kernel.cu" />
<Content Include="theano\sandbox\cuda\conv_kernel.cu" />
<Content Include="theano\sandbox\cuda\cuda_ndarray.cu" />
<Content Include="theano\sandbox\cuda\cuda_ndarray.cuh" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" />
</Project>
\ No newline at end of file

Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "Theano", "Theano.pyproj", "{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B67D762D-0020-4E02-9DDF-7DB4F89B1DD3}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
...@@ -4,6 +4,7 @@ import errno ...@@ -4,6 +4,7 @@ import errno
import os import os
import platform import platform
import re import re
import sys
import theano import theano
from theano.configparser import config, AddConfigVar, ConfigParam, StrParam from theano.configparser import config, AddConfigVar, ConfigParam, StrParam
...@@ -14,7 +15,7 @@ def default_compiledirname(): ...@@ -14,7 +15,7 @@ def default_compiledirname():
platform.platform(), platform.platform(),
platform.processor(), platform.processor(),
platform.python_version()]) platform.python_version()])
platform_id = re.sub("[\(\)\s]+", "_", platform_id) platform_id = re.sub("[\(\)\s,]+", "_", platform_id)
return 'compiledir_' + platform_id return 'compiledir_' + platform_id
...@@ -50,9 +51,19 @@ def filter_compiledir(path): ...@@ -50,9 +51,19 @@ def filter_compiledir(path):
return path return path
# TODO Using the local user profile on Windows is currently disabled as it
# is not documented yet, and may break some existing code. It will be enabled
# in a future code update.
if False and sys.platform == 'win32':
# On Windows we should not write temporary files to a directory
# that is part of the roaming part of the user profile. Instead
# we use the local part of the user profile.
basecompiledir = os.path.join(os.environ['LOCALAPPDATA'], 'theano')
else:
basecompiledir = os.path.join(config.home, '.theano')
AddConfigVar('base_compiledir', AddConfigVar('base_compiledir',
"arch-independent cache directory for compiled modules", "arch-independent cache directory for compiled modules",
StrParam(os.path.join(config.home, '.theano'), allow_override=False)) StrParam(basecompiledir, allow_override=False))
AddConfigVar('compiledir', AddConfigVar('compiledir',
"arch-dependent cache directory for compiled modules", "arch-dependent cache directory for compiled modules",
......
import atexit, logging, os, stat, sys import atexit, logging, os, shutil, stat, sys
from theano.compile import optdb from theano.compile import optdb
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
...@@ -122,7 +122,11 @@ if cuda_available: ...@@ -122,7 +122,11 @@ if cuda_available:
try: try:
open(libcuda_ndarray_so).close() open(libcuda_ndarray_so).close()
except IOError: except IOError:
os.symlink(cuda_ndarray_so, libcuda_ndarray_so) if sys.platform == "win32":
# The Python `os` module does not support symlinks on win32.
shutil.copyfile(cuda_ndarray_so, libcuda_ndarray_so)
else:
os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
try: try:
gpu_init() gpu_init()
......
...@@ -471,7 +471,10 @@ class GpuSum(Op): ...@@ -471,7 +471,10 @@ class GpuSum(Op):
) )
{ {
""" %locals() """ %locals()
print >> sio, "int new_dims[%(nd_out)s]; " % locals() if nd_out > 0:
print >> sio, "int new_dims[%(nd_out)s]; " % locals()
else:
print >> sio, "int *new_dims=NULL; "
j = 0 j = 0
for i in xrange(nd_in): for i in xrange(nd_in):
......
#define _CUDA_NDARRAY_C
#include <Python.h> #include <Python.h>
#include <structmember.h> #include <structmember.h>
...@@ -3420,6 +3422,292 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args) ...@@ -3420,6 +3422,292 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
return NULL; return NULL;
} }
// Number of int entries required in host_structure / dev_structure for an
// nd-dimensional array: nd dims, then nd strides, then nd log2(dim) values.
int
cnda_structure_size(int nd)
{
    return 3 * nd;
}
// Host-side accessor: the nd dimension entries sit at the start of
// host_structure.
const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{
    const int * dims = self->host_structure;
    return dims;
}
// Host-side accessor: the nd stride entries follow the nd dimension
// entries inside host_structure.
const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{
    return &self->host_structure[self->nd];
}
// Host-side accessor: the nd log2(dim) entries are stored after the dims
// and strides, i.e. 2*nd ints into host_structure.
const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{
    return &self->host_structure[2 * self->nd];
}
// Mark the device-side copy of host_structure as stale.  The next
// CudaNdarray_DEV_* accessor will re-upload it via
// cnda_copy_structure_to_device().
void
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
self->dev_structure_fresh = 0;
}
// Structural equality check between two CudaNdarrays: compares sync state,
// nd, host dims+strides, base pointer, allocation flag and device data
// pointer.  Returns 1 if equal, 0 otherwise.
// ignoreSync: skip the dev_structure_fresh comparison.
// ignoreBase: skip the base-pointer comparison.
// NOTE(review): verbose is hard-coded to 1, so every mismatch prints a
// diagnostic to stdout — presumably leftover debugging; confirm before
// relying on silent operation.
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
int verbose = 1;
if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
return 0;
}
if (cnda1->nd != cnda2->nd)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
return 0;
}
// host_structure holds dims (nd ints) then strides (nd ints); compare both.
for (int i=0; i < 2*cnda1->nd; i++)
{
if (cnda1->host_structure[i] != cnda2->host_structure[i])
{
if(verbose)
fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
return 0;
}
}
if (!ignoreBase && cnda1->base != cnda2->base)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4");
return 0;
}
else if (cnda1->data_allocated != cnda2->data_allocated)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5");
return 0;
}
else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6");
// no need to check devdata if data is not allocated
return 0;
}
return 1;
}
// Strict equality: compare everything — do not ignore the device-sync flag
// nor the base pointer (delegates to CudaNdarray_EqualAndIgnore with both
// ignore flags off).
int
CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
{
return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
}
// Set dimension idx of self to d, updating the cached log2(dim) entry and
// marking the device structure dirty.  Out-of-range arguments only warn on
// stderr; the write still proceeds — NOTE(review): a bad idx would then
// write out of bounds; confirm callers always pass valid indices.
void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
}
if (d != self->host_structure[idx])
{
self->host_structure[idx] = d;
// Cache log2(d) when d is an exact power of two, else -1.
int log2d = (int)log2((double)d);
self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
cnda_mark_dev_structure_dirty(self);
}
}
// Set stride idx of self to s (strides live at host_structure[nd + idx])
// and mark the device structure dirty.  Out-of-range idx only warns on
// stderr; the write still proceeds.
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
}
if (s != CudaNdarray_HOST_STRIDES(self)[idx])
{
self->host_structure[idx+self->nd] = s;
cnda_mark_dev_structure_dirty(self);
}
}
// Upload host_structure (dims, strides, log2dims) to dev_structure on the
// GPU via cublasSetVector, then mark the device copy as fresh.
// Returns 0 on success, -1 on cublas error (Python RuntimeError is set).
int
cnda_copy_structure_to_device(CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
return -1;
}
self->dev_structure_fresh = 1;
return 0;
}
// Device-side dims pointer.  Re-uploads the structure first if it is
// stale; returns NULL (with a Python error set) if the upload fails.
const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure;
}
// Device-side strides pointer (nd ints past the dims).  Re-uploads the
// structure first if stale; NULL on upload failure.
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure + self->nd;
}
// Device-side log2(dim) pointer (2*nd ints past the start).  Re-uploads
// the structure first if stale; NULL on upload failure.
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure + 2*self->nd;
}
// Raw pointer to the array's device (GPU) data buffer.
float *
CudaNdarray_DEV_DATA(const CudaNdarray * self)
{
    float * data = self->devdata;
    return data;
}
/**
 * Return the number of elements in the ndarray (product of the dimensions).
 * An uninitialized array (nd == -1) has zero elements.
 */
int
CudaNdarray_SIZE(const CudaNdarray *self)
{
    if (self->nd == -1) return 0;
    const int * dims = CudaNdarray_HOST_DIMS(self);
    int n_elements = 1;
    for (int i = 0; i < self->nd; ++i)
        n_elements *= dims[i];
    return n_elements;
}
// Property-getter wrapper exposing CudaNdarray_SIZE to Python as an int.
// `closure` is the unused CPython getter cookie.  Uses PyInt_FromLong,
// i.e. the Python 2 C API.
PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{
return PyInt_FromLong(CudaNdarray_SIZE(self));
}
// [Re]allocate the host and device structure buffers of `self` for `nd`
// dimensions (no data storage is allocated).  Frees any previous
// structures first.  nd == -1 leaves the array uninitialized.
// Returns 0 on success, -1 on allocation failure (Python error set).
int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
if (nd != self->nd)
{
// Release the old device-side structure, if any.
if (self->dev_structure)
{
if (device_free(self->dev_structure))
{
return -1;
}
self->dev_structure = NULL;
}
// Release the old host-side structure, if any.
if (self->host_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->nd = -1;
}
if (nd == -1) return 0;
self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
if (NULL == self->host_structure)
{
PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
return -1;
}
//initialize all dimensions and strides to 0
for (int i = 0; i < cnda_structure_size(nd); ++i)
{
self->host_structure[i] = 0;
}
int struct_size = cnda_structure_size(nd);
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
// Roll back the host allocation on device-malloc failure
// (device_malloc sets the Python error message).
free(self->host_structure);
self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
}
self->nd = nd;
self->dev_structure_fresh = 0;
}
return 0;
}
// C++ overload convenience: accept a CudaNdarray base and forward to the
// PyObject* variant of CudaNdarray_set_device_data.
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
// Python-level wrapper: return CudaNdarray_is_c_contiguous(self) as a
// Python bool.
PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
{
return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
}
// Debug dump of a CudaNdarray to `fd`: pointers, nd, flags, host dims and
// strides, and the device-side dims/strides fetched one int at a time via
// cublasGetVector.  NOTE(review): cublasGetVector return status is not
// checked here — on error the printed device values may be stale.
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
}
fprintf(fd, "\n\tHOST_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
}
int data=0;
// Device dims: copy each int back from GPU memory before printing.
fprintf(fd, "\n\tDEV_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure+i, 1,
&data, 1);
fprintf(fd, "%i\t", data);
}
fprintf(fd, "\n\tDEV_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure + self->nd+i, 1,
&data, 1);
fprintf(fd, "%i \t", data);
}
fprintf(fd, "\n");
}
/* /*
Local Variables: Local Variables:
mode:c++ mode:c++
......
...@@ -6,6 +6,16 @@ ...@@ -6,6 +6,16 @@
#include <cublas.h> #include <cublas.h>
#ifdef _WIN32
#ifdef _CUDA_NDARRAY_C
#define DllExport __declspec( dllexport )
#else
#define DllExport __declspec( dllimport )
#endif
#else
#define DllExport
#endif
typedef float real; typedef float real;
#define REAL_TYPENUM 11 #define REAL_TYPENUM 11
...@@ -36,8 +46,8 @@ typedef float real; ...@@ -36,8 +46,8 @@ typedef float real;
* device_malloc will set the Python error message before returning None. * device_malloc will set the Python error message before returning None.
* device_free will return nonzero on failure (after setting the python error message) * device_free will return nonzero on failure (after setting the python error message)
*/ */
void * device_malloc(size_t size); DllExport void * device_malloc(size_t size);
int device_free(void * ptr); DllExport int device_free(void * ptr);
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
...@@ -81,114 +91,50 @@ struct CudaNdarray ...@@ -81,114 +91,50 @@ struct CudaNdarray
* Return a CudaNdarray whose 'nd' dimensions are all 0. * Return a CudaNdarray whose 'nd' dimensions are all 0.
* if nd==-1, it is not initialized. * if nd==-1, it is not initialized.
*/ */
PyObject * DllExport PyObject *
CudaNdarray_New(int nd=-1); CudaNdarray_New(int nd=-1);
/** /**
* Return 1 for a CudaNdarray otw 0 * Return 1 for a CudaNdarray otw 0
*/ */
int DllExport int
CudaNdarray_Check(const PyObject * ob); CudaNdarray_Check(const PyObject * ob);
/** /**
* Return 1 for a CudaNdarray otw 0 * Return 1 for a CudaNdarray otw 0
*/ */
int DllExport int
CudaNdarray_CheckExact(const PyObject * ob); CudaNdarray_CheckExact(const PyObject * ob);
/** /**
* Return true for a C-contiguous CudaNdarray, else false * Return true for a C-contiguous CudaNdarray, else false
*/ */
bool DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self); CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**** /****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions. * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/ */
int DllExport int cnda_structure_size(int nd);
cnda_structure_size(int nd)
{
// dim0, dim1, ...
// str0, str1, ...
// log2(dim0), log2(dim1), ...
return nd + nd + nd;
}
const int * DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self) CudaNdarray_HOST_DIMS(const CudaNdarray * self);
{
return self->host_structure;
}
const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{
return self->host_structure + self->nd;
}
const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{
return self->host_structure + 2*self->nd;
}
void DllExport const int *
cnda_mark_dev_structure_dirty(CudaNdarray * self) CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
{
self->dev_structure_fresh = 0;
}
int DllExport const int *
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase) CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
{
int verbose = 1;
if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
return 0;
}
if (cnda1->nd != cnda2->nd)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
return 0;
}
for (int i=0; i < 2*cnda1->nd; i++)
{
if (cnda1->host_structure[i] != cnda2->host_structure[i])
{
if(verbose)
fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
return 0;
}
}
if (!ignoreBase && cnda1->base != cnda2->base) DllExport void
{ cnda_mark_dev_structure_dirty(CudaNdarray * self);
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4");
return 0;
}
else if (cnda1->data_allocated != cnda2->data_allocated)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5");
return 0;
}
else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
{
if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6");
// no need to check devdata if data is not allocated
return 0;
}
return 1; DllExport int
} CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase);
// Default: do not ignore sync of dev and host structures in comparing, and do not ignore difference in base pointers // Default: do not ignore sync of dev and host structures in comparing, and do not ignore difference in base pointers
int DllExport int
CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2) CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
{
return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
}
/**** /****
* Set the idx'th dimension to value d. * Set the idx'th dimension to value d.
...@@ -197,173 +143,44 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2) ...@@ -197,173 +143,44 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
* *
* Does not sync structure to host. * Does not sync structure to host.
*/ */
void DllExport void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) CudaNdarray_set_dim(CudaNdarray * self, int idx, int d);
{
if ((idx >= self->nd) || (idx < 0) || (d < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: %i %i\n", idx, d);
}
if (d != self->host_structure[idx]) DllExport void
{ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s);
self->host_structure[idx] = d;
int log2d = (int)log2((double)d);
self->host_structure[idx + 2*self->nd] = (d == (1 << log2d)) ? log2d : -1;
cnda_mark_dev_structure_dirty(self);
}
}
void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
if ((idx >= self->nd) || (idx < 0))
{
fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
}
if (s != CudaNdarray_HOST_STRIDES(self)[idx])
{
self->host_structure[idx+self->nd] = s;
cnda_mark_dev_structure_dirty(self);
}
}
/*** /***
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self) * Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
* *
* This means: recalculate the log2dims and transfer structure to the card * This means: recalculate the log2dims and transfer structure to the card
*/ */
int DllExport int cnda_copy_structure_to_device(CudaNdarray * self);
cnda_copy_structure_to_device(CudaNdarray * self)
{
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
return -1;
}
self->dev_structure_fresh = 1;
return 0;
}
const int * DllExport const int *CudaNdarray_DEV_DIMS(CudaNdarray * self);
CudaNdarray_DEV_DIMS(CudaNdarray * self) DllExport const int *CudaNdarray_DEV_STRIDES(CudaNdarray * self);
{ DllExport const int *CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self);
if (!self->dev_structure_fresh) DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure;
}
const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure + self->nd;
}
const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{
if (!self->dev_structure_fresh)
{
if (cnda_copy_structure_to_device(self))
return NULL;
}
return self->dev_structure + 2*self->nd;
}
float *
CudaNdarray_DEV_DATA(const CudaNdarray * self)
{
return self->devdata;
}
/** /**
* Return the number of elements in the ndarray (product of the dimensions) * Return the number of elements in the ndarray (product of the dimensions)
*/ */
int DllExport int CudaNdarray_SIZE(const CudaNdarray *self);
CudaNdarray_SIZE(const CudaNdarray *self)
{
if (self->nd == -1) return 0;
int size = 1;
for (int i = 0; i < self->nd; ++i)
{
size *= CudaNdarray_HOST_DIMS(self)[i];
}
return size;
}
static PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{
return PyInt_FromLong(CudaNdarray_SIZE(self));
}
static PyObject *CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure);
/** /**
* Allocate a new CudaNdarray with room for given number of dimensions * Allocate a new CudaNdarray with room for given number of dimensions
* *
* No Storage space is allocated (and all dimensions are 0) * No Storage space is allocated (and all dimensions are 0)
*/ */
PyObject * CudaNdarray_new_nd(const int nd); DllExport PyObject * CudaNdarray_new_nd(const int nd);
/** /**
* [Re]allocate a CudaNdarray with access to 'nd' dimensions. * [Re]allocate a CudaNdarray with access to 'nd' dimensions.
* *
* Note: This does not allocate storage for data. * Note: This does not allocate storage for data.
*/ */
int CudaNdarray_set_nd(CudaNdarray * self, const int nd) DllExport int CudaNdarray_set_nd(CudaNdarray * self, const int nd);
{
if (nd != self->nd)
{
if (self->dev_structure)
{
if (device_free(self->dev_structure))
{
return -1;
}
self->dev_structure = NULL;
}
if (self->host_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->nd = -1;
}
if (nd == -1) return 0;
self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
if (NULL == self->host_structure)
{
PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
return -1;
}
//initialize all dimensions and strides to 0
for (int i = 0; i < cnda_structure_size(nd); ++i)
{
self->host_structure[i] = 0;
}
int struct_size = cnda_structure_size(nd);
if (struct_size)
{
self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
if (NULL == self->dev_structure)
{
free(self->host_structure);
self->host_structure = NULL;
self->dev_structure = NULL;
return -1;
}
}
self->nd = nd;
self->dev_structure_fresh = 0;
}
return 0;
}
/** /**
* CudaNdarray_alloc_contiguous * CudaNdarray_alloc_contiguous
...@@ -373,7 +190,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -373,7 +190,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions * Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/ */
template<typename inttype> template<typename inttype>
int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim) static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype * dim)
{ {
// allocate an empty ndarray with c_contiguous access // allocate an empty ndarray with c_contiguous access
// return 0 on success // return 0 on success
...@@ -434,9 +251,8 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -434,9 +251,8 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
/* /*
* Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated. * Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
*/ */
template<typename inttype> template<typename inttype>
PyObject * static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
CudaNdarray_NewDims(int nd, const inttype * dims)
{ {
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(); CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New();
if (rval) if (rval)
...@@ -456,103 +272,64 @@ CudaNdarray_NewDims(int nd, const inttype * dims) ...@@ -456,103 +272,64 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
* *
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`. * Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/ */
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base); DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base) DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
/** /**
* Return an independent copy of self * Return an independent copy of self
*/ */
PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo); DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/** /**
* Return an independent copy of self * Return an independent copy of self
*/ */
PyObject * CudaNdarray_Copy(CudaNdarray * self); DllExport PyObject * CudaNdarray_Copy(CudaNdarray * self);
/** /**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask. * Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
*/ */
PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask); DllExport PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask);
/** /**
* Transfer the contents of numpy array `obj` to `self`. * Transfer the contents of numpy array `obj` to `self`.
* *
* self is reallocated to have the correct dimensions if necessary. * self is reallocated to have the correct dimensions if necessary.
*/ */
int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj); DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
/** /**
* Transfer the contents of CudaNdarray `other` to `self`. * Transfer the contents of CudaNdarray `other` to `self`.
* *
* self is reallocated to have the correct dimensions if necessary. * self is reallocated to have the correct dimensions if necessary.
*/ */
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false); DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, bool unbroadcast = false);
/** /**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray. * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
*/ */
PyObject * DllExport PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self); CudaNdarray_CreateArrayObj(CudaNdarray * self);
PyObject * DllExport PyObject *
CudaNdarray_ZEROS(int n, int * dims); CudaNdarray_ZEROS(int n, int * dims);
/** /**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1] * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/ */
bool CudaNdarray_is_c_contiguous(const CudaNdarray * self); DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self) DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
{
return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
}
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
{
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
}
fprintf(fd, "\n\tHOST_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
}
int data=0; static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
fprintf(fd, "\n\tDEV_DIMS: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure+i, 1,
&data, 1);
fprintf(fd, "%i\t", data);
}
fprintf(fd, "\n\tDEV_STRIDES: ");
for (int i = 0; i < self->nd; ++i)
{
cublasGetVector(1, sizeof(int),
self->dev_structure + self->nd+i, 1,
&data, 1);
fprintf(fd, "%i \t", data);
}
fprintf(fd, "\n");
}
#endif #endif
/* /*
......
...@@ -534,33 +534,44 @@ class NaiveAlgo(object): ...@@ -534,33 +534,44 @@ class NaiveAlgo(object):
# collapse dimension that are broadcast in all inputs. # collapse dimension that are broadcast in all inputs.
# need to be done before contiguous collapse as it will break it. # need to be done before contiguous collapse as it will break it.
# do the dimensions and the strides # do the dimensions and the strides
if nd > 0:
print >> sio, "int local_dims[%(nd)s];" % locals()
else:
print >> sio, "int *local_dims=NULL;"
if nb_inputs > 0 and nd > 0:
print >> sio, """
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
""" % locals()
else:
print >> sio, """
int local_str[1][1];
int local_ostr[1][1];
"""
print >> sio, """ print >> sio, """
int local_dims[%(nd)s];
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
int nd_collapse = %(nd)s; int nd_collapse = %(nd)s;
for(int i=0;i<%(nd)s;i++){//init new dim for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i]; local_dims[i]=dims[i];
} }
"""%locals() """ % locals()
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
print >> sio, """ print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides for(int i=0;i<%(nd)s;i++){//init new strides
local_str[%(ipos)s][i]=i%(ipos)s_str[i]; local_str[%(ipos)s][i]=i%(ipos)s_str[i];
} }
"""%locals() """ % locals()
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
print >> sio, """ print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides for(int i=0;i<%(nd)s;i++){//init new strides
local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
} }
"""%locals() """ % locals()
if self.verbose>2: if self.verbose>2:
print >>sio, 'std::cerr <<"before broadcast collapse\\n";' print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
print >> sio, 'std::cerr << "local_dims";' print >> sio, 'std::cerr << "local_dims";'
for d in xrange(nd): for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals() print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
print >> sio, 'std::cerr << "\\n";' print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
...@@ -611,11 +622,18 @@ class NaiveAlgo(object): ...@@ -611,11 +622,18 @@ class NaiveAlgo(object):
# collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
# this is a good idea because we make less index calculation in the gpu. # this is a good idea because we make less index calculation in the gpu.
print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in xrange(nd)]) +"};" if nd > 0:
print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in xrange(nd)]) +"};"
else:
print >> sio, "int *nd_collapse_ = NULL;"
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
if not _logical_scalar(node.inputs[ipos]): if not _logical_scalar(node.inputs[ipos]):
print >> sio, """ if nd > 0:
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in xrange(nd)]) +"};" print >> sio, """
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in xrange(nd)]) +"};"
else:
print >> sio, """
int *nd_collapse_%(ipos)s = NULL;"""%locals()
print >> sio, """ print >> sio, """
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){ for(int i=0;i<nd_collapse;i++){
...@@ -839,9 +857,14 @@ nd_collapse_[i]=0; ...@@ -839,9 +857,14 @@ nd_collapse_[i]=0;
//std::cerr << "C_CODE %(opname)s START\\n"; //std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks //standard elemwise size checks
""" %locals() """ %locals()
print >> sio, """ if nd > 0:
int dims[%(nd)s] = {%(initial_dims)s}; print >> sio, """
""" %locals() int dims[%(nd)s] = {%(initial_dims)s};
""" % locals()
else:
print >> sio, """
int *dims = NULL;
"""
#check that all inputs have valid dimensions #check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
...@@ -851,9 +874,14 @@ nd_collapse_[i]=0; ...@@ -851,9 +874,14 @@ nd_collapse_[i]=0;
continue continue
broadcasts = ', '.join(map(str,map(int,node.inputs[id].broadcastable))) broadcasts = ', '.join(map(str,map(int,node.inputs[id].broadcastable)))
nd = node.inputs[id].ndim nd = node.inputs[id].ndim
print >> sio, """ if nd > 0:
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; print >> sio, """
""" %locals() int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
""" % locals()
else:
print >> sio, """
int *broadcasts_%(iname)s = NULL;
""" % locals()
emitted_inames[iname] = node.inputs[id] emitted_inames[iname] = node.inputs[id]
#check that all inputs have valid dimensions #check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
......
...@@ -164,7 +164,12 @@ def nvcc_module_compile_str( ...@@ -164,7 +164,12 @@ def nvcc_module_compile_str(
if config.nvcc.compiler_bindir: if config.nvcc.compiler_bindir:
cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir]) cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])
if sys.platform!='win32': if sys.platform == 'win32':
# add flags for Microsoft compiler to create .pdb files
preargs2.append('/Zi')
cmd.extend(['-Xlinker', '/DEBUG'])
if sys.platform != 'win32':
if local_bitwidth() == 64: if local_bitwidth() == 64:
cmd.append('-m64') cmd.append('-m64')
preargs2.append('-m64') preargs2.append('-m64')
...@@ -180,8 +185,10 @@ def nvcc_module_compile_str( ...@@ -180,8 +185,10 @@ def nvcc_module_compile_str(
if sys.platform != 'darwin': if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above # the 64bit CUDA libs are in the same files as are named by the function above
rpaths.append(os.path.join(config.cuda.root,'lib64')) rpaths.append(os.path.join(config.cuda.root,'lib64'))
for rpath in rpaths: if sys.platform != 'win32':
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])]) # the -rpath option is not understood by the Microsoft linker
for rpath in rpaths:
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag]) cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
cmd.extend('-I%s'%idir for idir in include_dirs) cmd.extend('-I%s'%idir for idir in include_dirs)
cmd.extend(['-o',lib_filename]) cmd.extend(['-o',lib_filename])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论