Commit 709c14cc authored by Frédéric Bastien

Merge pull request #3091 from nouiz/abergeron-gpuarray_dnn

gpuarray in new back-end
......@@ -56,7 +56,7 @@ if pygpu:
init_dev(config.device)
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
optdb.add_tags('gpuarray', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device)
......
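For reference, a minimal sketch of how the new back-end is initialized by hand. It assumes a working pygpu install and a CUDA device named 'cuda0' (both assumptions, not shown in this diff); setting the gpuarray.init_device flag read above does the same thing automatically at import time.

from theano.sandbox import gpuarray

# init_dev() is the entry point called above with config.device or
# config.gpuarray.init_device; it selects and initializes the device.
# The import-time code above additionally registers
# gpuarray_shared_constructor and adds the 'gpuarray' optimization tags
# when config.device points at a GPU.
gpuarray.init_dev('cuda0')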
......@@ -783,6 +783,10 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
raise NotImplementedError("grad disabled")
def empty_like(var):
return GpuAllocEmpty(var.type.dtype)(*var.shape)
class GpuContiguous(Op):
"""
Always return a c contiguous output. Copy the input only if it is
......
......@@ -29,12 +29,12 @@ class NVCC_compiler(NVCC_base):
# exist in the past
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
if bool(numpy_ver < [1, 7]):
flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
flags.append("-DNPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
flags.append("-DNPY_ARRAY_ALIGNED=NPY_ALIGNED")
flags.append("-DNPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
flags.append("-DNPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
flags.append("-DNPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
flags.append("-DNPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
# If the user didn't specify architecture flags add them
if not any(['-arch=sm_' in f for f in flags]):
......
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
#ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This covers only the functions we use
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
switch (err) {
case CUDNN_STATUS_SUCCESS:
return "The operation completed successfully.";
case CUDNN_STATUS_NOT_INITIALIZED:
return "The handle was not initialized(Is your driver recent enought?).";
case CUDNN_STATUS_ALLOC_FAILED:
return "Ressource allocation failed inside the library.";
case CUDNN_STATUS_BAD_PARAM:
return "An incorrect value was passed in.";
case CUDNN_STATUS_ARCH_MISMATCH:
return "The current GPU does not support the required features (only cc 3.0+ are supported).";
case CUDNN_STATUS_MAPPING_ERROR:
return "An access to GPU memory space failed (probably due to a failure to bind texture).";
case CUDNN_STATUS_EXECUTION_FAILED:
return "A kernel failed to execute.";
case CUDNN_STATUS_INTERNAL_ERROR:
return "An internal cuDNN operation failed.";
case CUDNN_STATUS_NOT_SUPPORTED:
return "The combination of parameters is not currently supported.";
default:
return "Unknown error code.";
}
}
// some macros to help support cudnn R1 while using R2 code.
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t inputTensorDesc,
const cudnnFilterDescriptor_t filterDesc,
int *n,
int *c,
int *h,
int *w) {
return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t destDesc,
cudnnConvolutionFwdPreference_t preference,
size_t memoryLimitInbytes,
cudnnConvolutionFwdAlgo_t *algo) {
*algo = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t srcDesc,
const cudnnFilterDescriptor_t filterDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensor4dDescriptor_t destDesc,
cudnnConvolutionFwdAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionFwdAlgo_t algo,
void *workSpace,
size_t workSpaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t destDesc,
void *destData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionForward(handle, srcDesc, srcData,
filterDesc, filterData,
convDesc, destDesc, destData,
r);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
diffDesc, diffData,
convDesc, gradDesc, gradData,
r);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
assert(*(float *)alpha == 1.0);
cudnnAccumulateResult_t r;
if (*(float *)beta == 0.0) {
r = CUDNN_RESULT_NO_ACCUMULATE;
} else if (*(float *)beta == 1.0) {
r = CUDNN_RESULT_ACCUMULATE;
} else {
assert(0 && "beta must be 0.0 or 1.0");
}
/* These casts are needed because the R1 function's parameters are not
   declared const. */
return cudnnConvolutionBackwardData(handle,
(cudnnFilterDescriptor_t)filterDesc,
filterData,
(cudnnTensorDescriptor_t)diffDesc,
diffData,
(cudnnConvolutionDescriptor_t)convDesc,
(cudnnTensorDescriptor_t)gradDesc,
gradData,
r);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
// Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't check whether this is the right combination, but since we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif
#endif
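The wrappers above expose the cuDNN R2 convention dest = alpha * result + beta * dest on top of the R1 calls, which only support accumulate / no-accumulate, hence the asserts restricting alpha to 1.0 and beta to 0.0 or 1.0. A plain numpy sketch of that blending convention (an illustration only, not cuDNN itself):

import numpy

def blend(result, dest, alpha, beta):
    # cuDNN R2 convention: dest = alpha * result + beta * dest
    return alpha * result + beta * dest

res = numpy.ones((2, 2), dtype='float32')
dest = numpy.full((2, 2), 3, dtype='float32')
print(blend(res, dest, 1.0, 0.0))  # overwrite   -> CUDNN_RESULT_NO_ACCUMULATE
print(blend(res, dest, 1.0, 1.0))  # accumulate  -> CUDNN_RESULT_ACCUMULATE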
#section support_code
static cudnnHandle_t _handle = NULL;
static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt;
size_t ds;
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3;
// cuDNN does not accept 0 strides (used for broadcastable dims); substitute C-contiguous defaults
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3];
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3];
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1];
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1),
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3),
str0, str1, str2, str3);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
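As an aside, a small Python sketch (an illustration only, not part of the patch; fixed_strides is a hypothetical helper name) of the stride fix-up c_set_tensor4d performs above: zero strides, which pygpu uses for broadcastable dimensions, are replaced by C-contiguous defaults expressed in elements rather than bytes.

def fixed_strides(dims, strides_bytes, elsize):
    n, c, h, w = dims
    s0, s1, s2, s3 = strides_bytes
    return (s0 // elsize if s0 else c * h * w,
            s1 // elsize if s1 else h * w,
            s2 // elsize if s2 else w,
            s3 // elsize if s3 else 1)

# A (2, 1, 3, 3) float32 array broadcast along dim 1 (byte strides (36, 0, 12, 4)):
print(fixed_strides((2, 1, 3, 3), (36, 0, 12, 4), 4))  # -> (9, 9, 3, 1)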
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
cudnnDataType_t dt;
if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
PyErr_SetString(PyExc_ValueError,
"Only contiguous filters (kernels) are supported.");
return -1;
}
switch (var->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1;
}
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt,
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1],
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.",
cudnnGetErrorString(err));
return -1;
}
return 0;
}
#section init_code
{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
#if PY_MAJOR_VERSION >= 3
return NULL;
#else
return;
#endif
}
}
#section support_code_struct
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code_struct
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
#endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CONV_ALGO,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
/*
 * This is less than ideal since we need to free the workspace afterwards
 * (which introduces a synchronization point), but we don't have a module
 * in which to place a nice get_work_mem() function.
 */
if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
return 1;
}
}
err = cudnnConvolutionForward(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
if (worksize != 0)
c->ops->buffer_release(workspace);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (im->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
err = cudnnConvolutionBackwardData(
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER,
pygpu_default_context()) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
err = cudnnConvolutionBackwardFilter(
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
desc,
beta_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
......@@ -42,4 +42,9 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out;
}
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
* all recent (as of June 2015) versions of libgpuarray. This is also
* promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data))
#endif
......@@ -12,11 +12,13 @@ from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost,
......@@ -39,6 +41,10 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
......@@ -689,6 +695,9 @@ def local_gpu_conv(node):
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
register_opt()(conv_groupopt)
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
......
......@@ -7,10 +7,10 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
from .basic_ops import GpuFromHost, HostFromGpu
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
_one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd):
......@@ -18,10 +18,10 @@ def grab_cpu_scalar(v, nd):
n = v.owner
if (isinstance(n.op, GpuDimShuffle) and
n.op.new_order == ('x',) * nd):
return host_from_gpu(n.inputs[0])
return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd):
return n.inputs[0]
return grab_cpu_scalar(n.inputs[0])
elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd)
else:
......@@ -37,7 +37,7 @@ def find_node(v, cls, ignore_clients=False):
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
if v.owner is not None and (ignore_clients or v.clients == 1):
if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls):
return v.owner
elif (isinstance(v.owner.op, GpuFromHost) and
......
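The len(v.clients) == 1 fix above matters because a clients list never compares equal to the integer 1, so the old test silently disabled digging through single-client nodes. A plain Python illustration (the client tuple shown is made up):

clients = [('some_apply_node', 0)]   # hypothetical single client
print(clients == 1)        # False: the old condition was never true
print(len(clients) == 1)   # True: the intended single-client check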
from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy
import unittest
import theano
import theano.tensor as T
......@@ -11,12 +12,13 @@ from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from ..nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
"""
......@@ -290,3 +292,96 @@ def softmax_unittest_template(dtypeInput):
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
class test_SoftMax(unittest.TestCase):
gpu_op = GpuSoftmax
mode = mode_wo_cudnn
def _test_softmax(
self,
x,
x_gpu,
f_z,
f_gpu_z,
cmp
):
"""
This is a basic test for GpuSoftmax and GpuDnnSoftmax.
We check that we loop when there are too many blocks,
and that we fall back to slower code when there isn't enough shared memory.
"""
f_z_out = f_z(x)
f_gpu_z_out = f_gpu_z(x_gpu)
f = theano.function([x], f_z_out, mode=mode_without_gpu)
f_gpu = theano.function([x_gpu], f_gpu_z_out, mode=self.mode)
self._check_types(f, f_gpu, T.nnet.Softmax, self.gpu_op)
# we need to test n>32*1024 to check that we make the block loop.
cmp(1, 5, f, f_gpu)
cmp(2, 5, f, f_gpu)
cmp(10, 5, f, f_gpu)
cmp(100, 5, f, f_gpu)
cmp(1000, 5, f, f_gpu)
cmp(10000, 5, f, f_gpu)
cmp(4074, 400, f, f_gpu)
cmp(784, 784, f, f_gpu)
cmp(4, 1000, f, f_gpu)
cmp(4, 1024, f, f_gpu)
cmp(4, 2000, f, f_gpu)
cmp(4, 2024, f, f_gpu)
# The GTX 285 doesn't have enough shared memory.
cmp(4, 4074, f, f_gpu)
# The GTX 580, 680 and Kepler cards don't have enough shared memory.
cmp(2, 10000, f, f_gpu)
cmp(128, 16 * 1024, f, f_gpu)
cmp(128, 64 * 1024, f, f_gpu)
# cudnn permits no more than 2^15 - 1 rows
cmp((2 << 15) - 1, 5, f, f_gpu)
cmp(5, 2 << 15, f, f_gpu)
return f, f_gpu
def _cmp(self, n, m, f, f_gpu):
# print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
assert len([node for node in graph_gpu.maker.fgraph.toposort()
if isinstance(node.op, f_gpu_type)]) == 1
def test_softmax(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# cuDNN R1 cannot handle these test cases, but the Theano softmax can, so
# we test them only for the Theano softmax.
self._cmp(2 << 15, 5, f, f_gpu)
def test_softmax_shape_0(self):
x = T.fmatrix('x')
z = T.nnet.softmax_op
f, f_gpu = self._test_softmax(
x,
x,
z,
z,
self._cmp
)
# Theano can handle that case, but cudnn can't
self._cmp(0, 10, f, f_gpu)
......@@ -593,7 +593,7 @@ def get_scalar_constant_value(orig_v, elemwise=True,
# mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly.
elif isinstance(v.owner.op, ScalarFromTensor):
elif isinstance(v.owner.op, (ScalarFromTensor, TensorFromScalar)):
return get_scalar_constant_value(v.owner.inputs[0])
elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second):
......
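A minimal sketch of what the added TensorFromScalar case enables, assuming scalar_from_tensor and tensor_from_scalar are exposed in theano.tensor.basic (an assumption, not shown in this diff): a constant round-tripped through the scalar/tensor conversion ops should still be recognized by get_scalar_constant_value.

import numpy
import theano.tensor as T
from theano.tensor import get_scalar_constant_value
from theano.tensor.basic import scalar_from_tensor, tensor_from_scalar  # assumed location

c = T.constant(numpy.asarray(3, dtype='int64'))
v = tensor_from_scalar(scalar_from_tensor(c))
# With the change above, the TensorFromScalar node is looked through as well:
assert get_scalar_constant_value(v) == 3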